Open In Colab

Importing datasets¶

In [ ]:
import pandas as pd

# Customers table: order-level customer_id, person-level customer_unique_id,
# plus zip code prefix, city, and state.
customers = pd.read_csv('olist_customers_dataset.csv')
customers.head()
Out[ ]:
customer_id customer_unique_id customer_zip_code_prefix customer_city customer_state
0 06b8999e2fba1a1fbc88172c00ba8bc7 861eff4711a542e4b93843c6dd7febb0 14409 franca SP
1 18955e83d337fd6b2def6b18a428ac77 290c77bc529b7ac935b93aa66c333dc3 9790 sao bernardo do campo SP
2 4e7b3e00288586ebd08712fdd0374a03 060e732b5b29e8181a18229c7b0b2b5e 1151 sao paulo SP
3 b2b6027bc5c5109e529d4dc6358b12c3 259dac757896d24d7702b9acbbff3f3c 8775 mogi das cruzes SP
4 4f2d8ab171c80ec8364f7c12e35b23ad 345ecd01c38d18a9036ed96c73b8d066 13056 campinas SP
In [ ]:
# Geolocation table: latitude/longitude points keyed by zip code prefix
# (multiple points per prefix).
geolocation = pd.read_csv('olist_geolocation_dataset.csv')
geolocation.head()
Out[ ]:
geolocation_zip_code_prefix geolocation_lat geolocation_lng geolocation_city geolocation_state
0 1037 -23.545621 -46.639292 sao paulo SP
1 1046 -23.546081 -46.644820 sao paulo SP
2 1046 -23.546129 -46.642951 sao paulo SP
3 1041 -23.544392 -46.639499 sao paulo SP
4 1035 -23.541578 -46.641607 sao paulo SP
In [ ]:
# Order items table: one row per item in an order, with product, seller,
# price, freight value, and the shipping deadline.
order_items = pd.read_csv('olist_order_items_dataset.csv')
order_items.head()
Out[ ]:
order_id order_item_id product_id seller_id shipping_limit_date price freight_value
0 00010242fe8c5a6d1ba2dd792cb16214 1 4244733e06e7ecb4970a6e2683c13e61 48436dade18ac8b2bce089ec2a041202 2017-09-19 09:45:35 58.90 13.29
1 00018f77f2f0320c557190d7a144bdd3 1 e5f2d52b802189ee658865ca93d83a8f dd7ddc04e1b6c2c614352b383efe2d36 2017-05-03 11:05:13 239.90 19.93
2 000229ec398224ef6ca0657da4fc703e 1 c777355d18b72b67abbeef9df44fd0fd 5b51032eddd242adc84c38acab88f23d 2018-01-18 14:48:30 199.00 17.87
3 00024acbcdf0a6daa1e931b038114c75 1 7634da152a4610f1595efa32f14722fc 9d7a1d34a5052409006425275ba1c2b4 2018-08-15 10:10:18 12.99 12.79
4 00042b26cf59d7ce69dfabb4e55b4fd9 1 ac6c3623068f30de03045865e4e10089 df560393f3a51e74553ab94004ba5c87 2017-02-13 13:57:51 199.90 18.14
In [ ]:
# Payments table: payment method, installments, and value per order
# (an order can have several payment records — see payment_sequential).
order_payments = pd.read_csv('olist_order_payments_dataset.csv')
order_payments.head()
Out[ ]:
order_id payment_sequential payment_type payment_installments payment_value
0 b81ef226f3fe1789b1e8b2acac839d17 1 credit_card 8 99.33
1 a9810da82917af2d9aefd1278f1dcfa0 1 credit_card 1 24.39
2 25e8ea4e93396b6fa0d3dd708e76c1bd 1 credit_card 1 65.71
3 ba78997921bbcdc1373bb41e913ab953 1 credit_card 8 107.78
4 42fdf880ba16b47b59251dd489d4441a 1 credit_card 2 128.45
In [ ]:
# Reviews table: score plus optional free-text title/message per order.
reviews = pd.read_csv('olist_order_reviews_dataset.csv')
reviews.head()
Out[ ]:
review_id order_id review_score review_comment_title review_comment_message review_creation_date review_answer_timestamp
0 7bc2406110b926393aa56f80a40eba40 73fc7af87114b39712e6da79b0a377eb 4 NaN NaN 2018-01-18 00:00:00 2018-01-18 21:46:59
1 80e641a11e56f04c1ad469d5645fdfde a548910a1c6147796b98fdf73dbeba33 5 NaN NaN 2018-03-10 00:00:00 2018-03-11 03:05:13
2 228ce5500dc1d8e020d8d1322874b6f0 f9e4b658b201a9f2ecdecbb34bed034b 5 NaN NaN 2018-02-17 00:00:00 2018-02-18 14:36:24
3 e64fb393e7b32834bb789ff8bb30750e 658677c97b385a9be170737859d3511b 5 NaN Recebi bem antes do prazo estipulado. 2017-04-21 00:00:00 2017-04-21 22:02:06
4 f7c4243c7fe1938f181bec41a392bdeb 8e6bfb81e283fa7e4f11123a3fb894f1 5 NaN Parabéns lojas lannister adorei comprar pela I... 2018-03-01 00:00:00 2018-03-02 10:26:53
In [ ]:
# Orders table: status and the full timeline of timestamps
# (purchase, approval, carrier handover, customer delivery, estimated delivery).
orders = pd.read_csv('olist_orders_dataset.csv')
orders.head()
Out[ ]:
order_id customer_id order_status order_purchase_timestamp order_approved_at order_delivered_carrier_date order_delivered_customer_date order_estimated_delivery_date
0 e481f51cbdc54678b7cc49136f2d6af7 9ef432eb6251297304e76186b10a928d delivered 2017-10-02 10:56:33 2017-10-02 11:07:15 2017-10-04 19:55:00 2017-10-10 21:25:13 2017-10-18 00:00:00
1 53cdb2fc8bc7dce0b6741e2150273451 b0830fb4747a6c6d20dea0b8c802d7ef delivered 2018-07-24 20:41:37 2018-07-26 03:24:27 2018-07-26 14:31:00 2018-08-07 15:27:45 2018-08-13 00:00:00
2 47770eb9100c2d0c44946d9cf07ec65d 41ce2a54c0b03bf3443c3d931a367089 delivered 2018-08-08 08:38:49 2018-08-08 08:55:23 2018-08-08 13:50:00 2018-08-17 18:06:29 2018-09-04 00:00:00
3 949d5b44dbf5de918fe9c16f97b45f8a f88197465ea7920adcdbec7375364d82 delivered 2017-11-18 19:28:06 2017-11-18 19:45:59 2017-11-22 13:39:59 2017-12-02 00:28:42 2017-12-15 00:00:00
4 ad21c59c0840e6cb83a9ceb5573f8159 8ab97904e6daea8866dbdbc4fb7aad2c delivered 2018-02-13 21:18:39 2018-02-13 22:20:29 2018-02-14 19:46:34 2018-02-16 18:17:02 2018-02-26 00:00:00
In [ ]:
# Products table: category (Portuguese), text-length stats, photos,
# and physical dimensions/weight.
products = pd.read_csv('olist_products_dataset.csv')
products.head()
Out[ ]:
product_id product_category_name product_name_lenght product_description_lenght product_photos_qty product_weight_g product_length_cm product_height_cm product_width_cm
0 1e9e8ef04dbcff4541ed26657ea517e5 perfumaria 40.0 287.0 1.0 225.0 16.0 10.0 14.0
1 3aa071139cb16b67ca9e5dea641aaa2f artes 44.0 276.0 1.0 1000.0 30.0 18.0 20.0
2 96bd76ec8810374ed1b65e291975717f esporte_lazer 46.0 250.0 1.0 154.0 18.0 9.0 15.0
3 cef67bcfe19066a932b7673e239eb23d bebes 27.0 261.0 1.0 371.0 26.0 4.0 26.0
4 9dc1a7de274444849c219cff195d0b71 utilidades_domesticas 37.0 402.0 4.0 625.0 20.0 17.0 13.0
In [ ]:
# Sellers table: seller id with zip code prefix, city, and state.
sellers = pd.read_csv('olist_sellers_dataset.csv')
sellers.head()
Out[ ]:
seller_id seller_zip_code_prefix seller_city seller_state
0 3442f8959a84dea7ee197c632cb2df15 13023 campinas SP
1 d1b65fc7debc3361ea86b5f14c68d2e2 13844 mogi guacu SP
2 ce3ad9de960102d0677a81f5d0bb7b2d 20031 rio de janeiro RJ
3 c0f3eea2e14555b6faeea3dd58c1b1c3 4195 sao paulo SP
4 51a04a8a6bdcb23deccc82b0b80742cf 12914 braganca paulista SP
In [ ]:
# Category name translation: Portuguese -> English lookup table.
product_names = pd.read_csv('product_category_name_translation.csv')
product_names.head()
Out[ ]:
product_category_name product_category_name_english
0 beleza_saude health_beauty
1 informatica_acessorios computers_accessories
2 automotivo auto
3 cama_mesa_banho bed_bath_table
4 moveis_decoracao furniture_decor
In [ ]:

In [ ]:
# @title Setting style for charts
import warnings
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import logging

# Silence all Python warnings for the rest of the notebook
# (note: this also hides pandas/seaborn warnings).
warnings.filterwarnings("ignore")

# Quiet matplotlib's own logger (font-lookup messages, etc.).
logging.getLogger('matplotlib').setLevel(logging.ERROR)

# Global typography: serif family (Liberation Serif) and shared
# title/label/tick sizes applied to every figure below.
_chart_style = {
    "font.family": "serif",
    "font.serif": ["Liberation Serif"],  # serif font available in the Colab environment
    "font.weight": "normal",
    "axes.titlesize": 16,
    "axes.labelsize": 12,
    "xtick.labelsize": 10,
    "ytick.labelsize": 10,
}
mpl.rcParams.update(_chart_style)

# Ten evenly spaced colors sampled across the Viridis colormap.
viridis_colors = plt.cm.viridis(np.linspace(0, 1, 10))

Inspecting datasets, columns, unique and missing values¶

In [ ]:
# @title Dataset Info
# Map of dataset name -> DataFrame for uniform inspection below.
datasets = {
    'customers': customers,
    'geolocation': geolocation,
    'order_items': order_items,
    'order_payments': order_payments,
    'reviews': reviews,
    'orders': orders,
    'products': products,
    'sellers': sellers,
    'product_names': product_names
}

for dataset_name, dataset in datasets.items():
    print('Dataset name: ', dataset_name)
    print(f'Shape of dataset: {dataset.shape}')

    # Build the per-column summary vectorised (dtype, cardinality, null count)
    # instead of looping over columns in Python.
    column_info_df = pd.DataFrame({
        'Column name': dataset.columns,
        'Data type': dataset.dtypes.values,
        'Unique values': dataset.nunique().values,
        'Missing values': dataset.isnull().sum().values,
    })

    print(column_info_df)
    print('------------------------------------------------------------------------')
Dataset name:  customers
Shape of dataset: (99441, 5)
                Column name Data type  Unique values  Missing values
0               customer_id    object          99441               0
1        customer_unique_id    object          96096               0
2  customer_zip_code_prefix     int64          14994               0
3             customer_city    object           4119               0
4            customer_state    object             27               0
------------------------------------------------------------------------
Dataset name:  geolocation
Shape of dataset: (1000163, 5)
                   Column name Data type  Unique values  Missing values
0  geolocation_zip_code_prefix     int64          19015               0
1              geolocation_lat   float64         717360               0
2              geolocation_lng   float64         717613               0
3             geolocation_city    object           8011               0
4            geolocation_state    object             27               0
------------------------------------------------------------------------
Dataset name:  order_items
Shape of dataset: (112650, 7)
           Column name Data type  Unique values  Missing values
0             order_id    object          98666               0
1        order_item_id     int64             21               0
2           product_id    object          32951               0
3            seller_id    object           3095               0
4  shipping_limit_date    object          93318               0
5                price   float64           5968               0
6        freight_value   float64           6999               0
------------------------------------------------------------------------
Dataset name:  order_payments
Shape of dataset: (103886, 5)
            Column name Data type  Unique values  Missing values
0              order_id    object          99440               0
1    payment_sequential     int64             29               0
2          payment_type    object              5               0
3  payment_installments     int64             24               0
4         payment_value   float64          29077               0
------------------------------------------------------------------------
Dataset name:  reviews
Shape of dataset: (99224, 7)
               Column name Data type  Unique values  Missing values
0                review_id    object          98410               0
1                 order_id    object          98673               0
2             review_score     int64              5               0
3     review_comment_title    object           4527           87656
4   review_comment_message    object          36159           58247
5     review_creation_date    object            636               0
6  review_answer_timestamp    object          98248               0
------------------------------------------------------------------------
Dataset name:  orders
Shape of dataset: (99441, 8)
                     Column name Data type  Unique values  Missing values
0                       order_id    object          99441               0
1                    customer_id    object          99441               0
2                   order_status    object              8               0
3       order_purchase_timestamp    object          98875               0
4              order_approved_at    object          90733             160
5   order_delivered_carrier_date    object          81018            1783
6  order_delivered_customer_date    object          95664            2965
7  order_estimated_delivery_date    object            459               0
------------------------------------------------------------------------
Dataset name:  products
Shape of dataset: (32951, 9)
                  Column name Data type  Unique values  Missing values
0                  product_id    object          32951               0
1       product_category_name    object             73             610
2         product_name_lenght   float64             66             610
3  product_description_lenght   float64           2960             610
4          product_photos_qty   float64             19             610
5            product_weight_g   float64           2204               2
6           product_length_cm   float64             99               2
7           product_height_cm   float64            102               2
8            product_width_cm   float64             95               2
------------------------------------------------------------------------
Dataset name:  sellers
Shape of dataset: (3095, 4)
              Column name Data type  Unique values  Missing values
0               seller_id    object           3095               0
1  seller_zip_code_prefix     int64           2246               0
2             seller_city    object            611               0
3            seller_state    object             23               0
------------------------------------------------------------------------
Dataset name:  product_names
Shape of dataset: (71, 2)
                     Column name Data type  Unique values  Missing values
0          product_category_name    object             71               0
1  product_category_name_english    object             71               0
------------------------------------------------------------------------

First glimpse: Visualizing the data¶

In [ ]:
# @title Sellers by Count of Orders Filled
import matplotlib.pyplot as plt
import pandas as pd

# Orders filled per seller (order_items has one row per item sold).
value_counts = order_items['seller_id'].value_counts()

# Bin sellers by order count. pd.cut bins are right-inclusive, so e.g.
# (10, 15] contains sellers with 11-15 orders; the labels reflect that.
# (Previous labels '10-15', '15-20', ... overlapped with the single-value
# bins and with each other at the boundaries.)
bins = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 30, 40, 50, 75, 100, 500, 1000, float('inf')]
labels = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11-15', '16-20', '21-30',
          '31-40', '41-50', '51-75', '76-100', '101-500', '501-1000', '>1000']

df_cut = pd.cut(value_counts, bins=bins, labels=labels)
seller_value_counts = df_cut.value_counts().sort_index()

# Plot — global serif styling is already configured in the style cell above.
plt.figure(figsize=(10, 6))
seller_value_counts.plot(kind='bar', color=plt.cm.viridis(0.6))

plt.title('Number of Sellers Based on the Count of Orders Filled', fontsize=16)
plt.xlabel('Number of Orders', fontsize=12)
plt.ylabel('Number of Sellers', fontsize=12)

# Hide all spines except the bottom axis line.
ax = plt.gca()
for side in ('top', 'right', 'left'):
    ax.spines[side].set_visible(False)

# Rotate x-axis labels for readability
plt.xticks(rotation=45, ha='right')

plt.show()
No description has been provided for this image
In [ ]:
# @title Total Orders per Month
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl

# Parse every order timestamp column to datetime.
date_columns = [
    'order_purchase_timestamp', 'order_approved_at',
    'order_delivered_carrier_date', 'order_delivered_customer_date',
    'order_estimated_delivery_date',
]
for col in date_columns:
    orders[col] = pd.to_datetime(orders[col])

# Monthly period for grouping; drop the sparse 2016 months.
orders['year_month'] = orders['order_purchase_timestamp'].dt.to_period('M')
orders = orders[orders['order_purchase_timestamp'].dt.year > 2016]

# Orders per month.
grouped = (
    orders.groupby('year_month')
          .size()
          .reset_index(name='order_count')
)

# Re-assert the shared serif styling (same values as the global style cell).
mpl.rcParams.update({
    "font.family": "serif",
    "axes.titlesize": 16,
    "axes.labelsize": 12,
    "xtick.labelsize": 10,
    "ytick.labelsize": 10,
})

# Bar chart of monthly order volume.
plt.figure(figsize=(8, 4))
sns.barplot(data=grouped, x='year_month', y='order_count', color=plt.cm.viridis(0.6))

plt.title('Number of Orders per Month', fontsize=16)
plt.xlabel('Month', fontsize=12)
plt.ylabel('Number of Orders', fontsize=12)

plt.xticks(rotation=45, ha='right')

# Hide all spines except the bottom axis line.
axes = plt.gca()
for side in ('top', 'right', 'left'):
    axes.spines[side].set_visible(False)

plt.show()
No description has been provided for this image
In [ ]:
# @title Distribution of Total Delivery Times
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl

# Disable the chained-assignment warning for the assignments below.
pd.options.mode.chained_assignment = None

# Total door-to-door delivery time: purchase -> customer delivery.
orders.loc[:, 'delivery_time'] = (
    orders['order_delivered_customer_date'] - orders['order_purchase_timestamp']
)

# Same duration expressed in (fractional) days for plotting.
orders.loc[:, 'delivery_time_days'] = (
    orders['delivery_time'].dt.total_seconds() / (24 * 3600)
)

# Re-assert the shared serif styling (same values as the global style cell).
mpl.rcParams.update({
    "font.family": "serif",
    "axes.titlesize": 16,
    "axes.labelsize": 12,
    "xtick.labelsize": 10,
    "ytick.labelsize": 10,
})

# Histogram with KDE overlay of delivery times in days.
plt.figure(figsize=(8, 4))
sns.histplot(data=orders, x='delivery_time_days', kde=True, color=plt.cm.viridis(0.5))

plt.title('Distribution of Delivery Time', fontsize=16)
plt.xlabel('Delivery Time (days)', fontsize=12)
plt.ylabel('Frequency', fontsize=12)

# Hide all spines except the bottom axis line.
axes = plt.gca()
for side in ('top', 'right', 'left'):
    axes.spines[side].set_visible(False)

plt.show()
No description has been provided for this image

Calculating delivery times¶

In [ ]:
import pandas as pd

# Work on a copy so downstream assignments don't warn about operating on a slice.
orders = orders.copy()

# Each delivery stage is (end timestamp) - (start timestamp). The previous
# version repeated the subtraction and the days conversion four times;
# drive both from one table of (column name, end, start) instead.
stage_defs = [
    ('order_processing_time',      'order_approved_at',             'order_purchase_timestamp'),
    ('carrier_delivery_time',      'order_delivered_carrier_date',  'order_approved_at'),
    ('customer_delivery_time',     'order_delivered_customer_date', 'order_delivered_carrier_date'),
    ('time_to_estimate_delivery',  'order_delivered_customer_date', 'order_estimated_delivery_date'),
]

SECONDS_PER_DAY = 24 * 3600
for name, end_col, start_col in stage_defs:
    orders[name] = orders[end_col] - orders[start_col]
    # Fractional days are easier to interpret and plot than timedeltas.
    orders[f'{name}_days'] = orders[name].dt.total_seconds() / SECONDS_PER_DAY

# Preview the new metrics alongside the existing total delivery time.
orders[['order_id',
        'order_processing_time', 'carrier_delivery_time', 'customer_delivery_time',
        'time_to_estimate_delivery', 'delivery_time',
        'order_processing_time_days', 'carrier_delivery_time_days',
        'customer_delivery_time_days', 'time_to_estimate_delivery_days',
        'delivery_time_days']].head()
Out[ ]:
order_id order_processing_time carrier_delivery_time customer_delivery_time time_to_estimate_delivery delivery_time order_processing_time_days carrier_delivery_time_days customer_delivery_time_days time_to_estimate_delivery_days delivery_time_days
0 e481f51cbdc54678b7cc49136f2d6af7 0 days 00:10:42 2 days 08:47:45 6 days 01:30:13 -8 days +21:25:13 8 days 10:28:40 0.007431 2.366493 6.062650 -7.107488 8.436574
1 53cdb2fc8bc7dce0b6741e2150273451 1 days 06:42:50 0 days 11:06:33 12 days 00:56:45 -6 days +15:27:45 13 days 18:46:08 1.279745 0.462882 12.039410 -5.355729 13.782037
2 47770eb9100c2d0c44946d9cf07ec65d 0 days 00:16:34 0 days 04:54:37 9 days 04:16:29 -18 days +18:06:29 9 days 09:27:40 0.011505 0.204595 9.178113 -17.245498 9.394213
3 949d5b44dbf5de918fe9c16f97b45f8a 0 days 00:17:53 3 days 17:54:00 9 days 10:48:43 -13 days +00:28:42 13 days 05:00:36 0.012419 3.745833 9.450498 -12.980069 13.208750
4 ad21c59c0840e6cb83a9ceb5573f8159 0 days 01:01:50 0 days 21:26:05 1 days 22:30:28 -10 days +18:17:02 2 days 20:58:23 0.042940 0.893113 1.937824 -9.238171 2.873877
In [ ]:
# PREPROCESSING TIME
# we can observe that preprocessing time of orders constitute a small part of total delivery time in most cases;
# nevertheless, there are outliers and for some orders the processing time can constitute as much as 60% of total delivery time

# CARRIER TAKEOVER TIME
# we can observe that the time from order processing completion to when the carrier assumes responsibility for the delivery ranges between approx. 18% and 36%;
# similarly, there are outliers with this time constituting more than 60% of total delivery time

# CARRIER DELIVERY TIME (TRANSIT TIME)
# carrier delivery constitutes the largest part of total delivery time - on average approx. 60% - 80%;
# however, there are outliers below 30% of total delivery time
In [ ]:
# order delivery time distribution is affected by:

# 1) customer profile - city, zip code (geolocation)
# 2) seller profile - city, zip code (geolocation)

# ---> use the geolocation zip code prefix to match latitude and longitude for each customer and seller

# we can look into other derived attributes:

# for customers:
# 1) the number of orders filled
# 2) the average order size
# 3) the payment type (e.g. credit card)
# 4) the number of payment installments
# 5) products purchased from various sellers (by chance or a particular choice?) (i.e., the number of sellers purchased from)
# 6) customer relationship duration (how long a customer has been ordering from a particular seller)

# for sellers:
# 1) the number of customers/ orders processed
# 2) the length of operation (are there new sellers entering the market or old sellers exiting?)
# 3) customer profiling (a high value customer?)
In [ ]:
# Share of each delivery stage in the total door-to-door delivery time, per order.
share_map = {
    'processing_share': 'order_processing_time_days',
    'carrier_share': 'carrier_delivery_time_days',
    'customer_share': 'customer_delivery_time_days',
}
for share_col, days_col in share_map.items():
    orders[share_col] = orders[days_col] / orders['delivery_time_days']

orders[['order_id', 'processing_share', 'carrier_share', 'customer_share']].head()
Out[ ]:
order_id processing_share carrier_share customer_share
0 e481f51cbdc54678b7cc49136f2d6af7 0.000881 0.280504 0.718615
1 53cdb2fc8bc7dce0b6741e2150273451 0.092856 0.033586 0.873558
2 47770eb9100c2d0c44946d9cf07ec65d 0.001225 0.021779 0.976997
3 949d5b44dbf5de918fe9c16f97b45f8a 0.000940 0.283587 0.715473
4 ad21c59c0840e6cb83a9ceb5573f8159 0.014941 0.310770 0.674289
In [ ]:
# Bring seller_id onto each order row (order_items has one row per item,
# so multi-item orders contribute several rows).
orders_merged = orders.merge(order_items[['order_id', 'seller_id']],
                             on='order_id', how='left')

share_cols = ['processing_share', 'carrier_share', 'customer_share']

# Mean share of each delivery stage per seller.
grouped = orders_merged.groupby('seller_id')[share_cols].mean()

# Long format: one row per (seller, metric) pair for plotting.
melted_orders = grouped.melt(value_vars=share_cols,
                             var_name='Delivery Metric',
                             value_name='Share (%) in total delivery time')

# One box per delivery metric.
sns.catplot(data=melted_orders, x='Delivery Metric',
            y='Share (%) in total delivery time', kind='box', height=6, aspect=2)

plt.title('Box Plots of Delivery Times')
plt.xticks(rotation=45)  # rotate x labels for better visibility
plt.show()
No description has been provided for this image
In [ ]:
cols_to_plot = ['order_processing_time_days', 'carrier_delivery_time_days', 'customer_delivery_time_days', 'time_to_estimate_delivery_days', 'delivery_time_days']

# Mean delivery-stage durations per seller.
# (A bare `grouped` expression used to sit here; mid-cell it displayed nothing
# and was dead code, so it has been removed.)
grouped = orders_merged.groupby('seller_id')[cols_to_plot].mean().reset_index()

# Pairwise scatter plots of the per-seller means, with KDE on the diagonal.
sns.pairplot(grouped[cols_to_plot], diag_kind='kde')
plt.show()
No description has been provided for this image

Data cleaning¶

Removing negative delivery times

In [ ]:
# Observations: processing times run up to ~one month (possible outliers);
# carrier delivery times include negative values (data errors) which also
# propagate into customer/total delivery times; estimated delivery time is
# skewed towards faster-than-estimated deliveries.

# Count records with a negative duration in any delivery stage.
print('Length of dataframe before: ', len(orders))
negative_processing = (orders['order_processing_time_days'] < 0).sum()
negative_carrier = (orders['carrier_delivery_time_days'] < 0).sum()
negative_customer = (orders['customer_delivery_time_days'] < 0).sum()
negative_total = (orders['delivery_time_days'] < 0).sum()

print('Number of negative processing times: ', negative_processing)
print('Number of negative carrier delivery times: ', negative_carrier)
print('Number of negative customer delivery times: ', negative_customer)
print('Number of negative total delivery times: ', negative_total)

# Keep only rows where every stage duration is non-negative
# (single combined mask; NaN comparisons are False, so NaN rows drop too,
# exactly as with the equivalent sequential filters).
valid = (
    (orders['order_processing_time_days'] >= 0)
    & (orders['carrier_delivery_time_days'] >= 0)
    & (orders['customer_delivery_time_days'] >= 0)
    & (orders['delivery_time_days'] >= 0)
)
orders = orders[valid]

print('Length of dataframe after: ', len(orders))
Length of dataframe before:  99112
Number of negative processing times:  0
Number of negative carrier delivery times:  1359
Number of negative customer delivery times:  19
Number of negative total delivery times:  0
Length of dataframe after:  94820

Merging datasets¶

In [ ]:
# Sanity check: dimensions of the cleaned orders table.
print('Dataset shape: ', orders.shape)
Dataset shape:  (94820, 22)

Adding customer geolocation data to customers table¶

In [ ]:
# to the orders dataset:
# 1) need to merge customers geolocation
# 2) need to merge sellers geolocation
In [ ]:
# The geolocation table has >1M rows: the same zip prefix maps to many
# lat/lng points (different streets, buildings, etc.), and those exact points
# don't match customers/sellers directly. Keep the first occurrence per zip
# prefix as its representative coordinate.
geolocation_unique = geolocation.drop_duplicates(subset='geolocation_zip_code_prefix')

# Left-join representative coordinates onto customers by zip prefix,
# prefixing the coordinate columns with 'c_'.
geo_cols = ['geolocation_zip_code_prefix', 'geolocation_lat', 'geolocation_lng']
customers_geo = (
    customers
    .merge(geolocation_unique[geo_cols],
           left_on='customer_zip_code_prefix',
           right_on='geolocation_zip_code_prefix',
           how='left')
    .rename(columns={'geolocation_lat': 'c_geolocation_lat',
                     'geolocation_lng': 'c_geolocation_lng'})
)

Adding sellers geolocation data to sellers table¶

In [ ]:
# Left-join representative coordinates onto sellers by zip prefix,
# prefixing the coordinate columns with 's_'.
sellers_geo = (
    sellers
    .merge(geolocation_unique[['geolocation_zip_code_prefix', 'geolocation_lat', 'geolocation_lng']],
           left_on='seller_zip_code_prefix',
           right_on='geolocation_zip_code_prefix',
           how='left')
    .rename(columns={'geolocation_lat': 's_geolocation_lat',
                     'geolocation_lng': 's_geolocation_lng'})
)

Adding order item info (e.g. product id, seller id, price, freight value) to orders table¶

In [ ]:
# Attach item-level fields (product, seller, price, freight) to each order;
# multi-item orders produce multiple rows per order_id after this merge.
orders_merge_items = pd.merge(orders, order_items[['order_id', 'product_id', 'seller_id', 'price', 'freight_value']], on='order_id', how='left')
In [ ]:
# The merged frame has more rows than orders because each order can
# contain multiple items.
print('Length of the dataframes: ')
print('orders: ', len(orders))
print('orders_merge_items: ', len(orders_merge_items))
Length of the dataframes: 
orders:  94820
orders_merge_items:  108270
In [ ]:
# orders_merge_items contains additional records, because each order can consist of multiple items

# Items per order: each order_id appears once per item row.
# (groupby(...).size() replaces value_counts().reset_index().rename({'count': ...}),
# which relied on pandas >= 2.0 naming the reset column 'count'.)
num_items = (
    orders_merge_items.groupby('order_id')
    .size()
    .reset_index(name='num_items_in_order')
)

# Number of orders per item-count bucket.
num_orders = (
    num_items.groupby('num_items_in_order')['order_id']
    .count()
    .reset_index()
    .rename(columns={'order_id': 'num_orders'})
)

# Cumulative share of orders covered up to each item count.
total_orders = num_orders['num_orders'].sum()
num_orders['cumulative_count'] = num_orders['num_orders'].cumsum()
num_orders['cumulative_percentage'] = (num_orders['cumulative_count'] / total_orders * 100).round(2)
num_orders
Out[ ]:
num_items_in_order num_orders cumulative_count cumulative_percentage
0 1 85373 85373 90.04
1 2 7242 92615 97.67
2 3 1289 93904 99.03
3 4 487 94391 99.55
4 5 188 94579 99.75
5 6 184 94763 99.94
6 7 21 94784 99.96
7 8 8 94792 99.97
8 9 3 94795 99.97
9 10 8 94803 99.98
10 11 4 94807 99.99
11 12 5 94812 99.99
12 13 1 94813 99.99
13 14 2 94815 99.99
14 15 2 94817 100.00
15 20 2 94819 100.00
16 21 1 94820 100.00
In [ ]:
# we can observe that 90% of orders are with a single item and 99% of orders are with 3 items or less
In [ ]:
# Distinct sellers involved in each order.
num_sellers = (
    orders_merge_items.groupby('order_id')['seller_id']
    .nunique()
    .reset_index()
    .rename(columns={'seller_id': 'num_sellers_in_order'})
)

# Number of orders per seller-count bucket.
num_orders = (
    num_sellers.groupby('num_sellers_in_order')
    .count()
    .reset_index()
    .rename(columns={'order_id': 'num_orders'})
)

# Cumulative share of orders covered up to each seller count.
total_orders = num_orders['num_orders'].sum()
num_orders['cumulative_count'] = num_orders['num_orders'].cumsum()
num_orders['cumulative_percentage'] = (num_orders['cumulative_count'] / total_orders * 100).round(2)
num_orders
Out[ ]:
num_sellers_in_order num_orders cumulative_count cumulative_percentage
0 1 93600 93600 98.71
1 2 1164 94764 99.94
2 3 53 94817 100.00
3 4 2 94819 100.00
4 5 1 94820 100.00
In [ ]:
# we can observe that almost 99% of orders are filled by a single seller (including multiple item orders)

Adding customer geolocation info to core orders table¶

In [ ]:
# Attach customer coordinates to each order row via customer_id.
orders_merge_items_geoc = pd.merge(orders_merge_items, customers_geo[['customer_id', 'c_geolocation_lat', 'c_geolocation_lng']], on='customer_id', how='left')
In [ ]:
# Row count is unchanged by the customer-geo merge; only columns were added.
print('Dataset shape: ', orders_merge_items_geoc.shape)
Dataset shape:  (108270, 28)

Adding sellers geolocation info to core orders table¶

In [ ]:
# Attach seller coordinates to each order row via seller_id.
orders_merge_items_geo = pd.merge(orders_merge_items_geoc, sellers_geo[['seller_id', 's_geolocation_lat', 's_geolocation_lng']], on='seller_id', how='left')

Remove any missing values for coordinates (for plotting)

In [ ]:
# Count missing values per column; coordinate columns are NaN for customers
# whose zip prefix did not match the geolocation table.
print(customers_geo.isna().sum())
customer_id                      0
customer_unique_id               0
customer_zip_code_prefix         0
customer_city                    0
customer_state                   0
geolocation_zip_code_prefix    278
c_geolocation_lat              278
c_geolocation_lng              278
dtype: int64
In [ ]:
# Both the customers and sellers merges produce NaN coordinates —
# likely because those zip prefixes are absent from the geolocation table.
missing_mask = customers_geo['c_geolocation_lat'].isna()
nan_zip_prefixes = customers_geo.loc[missing_mask, 'customer_zip_code_prefix'].unique()
print('Missing geolocation info for these zip codes present in customers dataset: ', nan_zip_prefixes)
print('Number of zip codes missing: ', len(nan_zip_prefixes))
Missing geolocation info for these zip codes present in customers dataset:  [72300 11547 64605 72465  7729 72904 35408 78554 73369  8980 29949 65137
 28655 73255 28388  6930 71676 64047 61906 83210 71919 36956 35242 72005
 29718 41347 70324 70686 72341 12332 70716 71905 75784 73082 71884 71574
 72238 71996 76968 71975 72595 72017 72596 67105 25840 72002 72821 85118
 25919 95853 72583 68511 70701 71591 72535 95572 73090 72242 86135 70316
 73091 41098 58734 73310 71810 72280  7430 73081 70333 72268 35104 72455
 72237 17390 76897 84623 70702 72760 73088 29196 36596 57254 71995 73093
 75257 48504 83843 62625 37005 73401 49870 13307 28617 73402 56327 71976
 72587 85958 19740 77404 44135 28120 72863 87323 87511 72440 72243 65830
 71261 28575  2140 71551 72023 28160 55027 43870 94370 38710 42716 36248
 71593 71953 72549 72457 56485 71590 93602  7412  8342 39103 72536 59547
 59299 85894 36857 71993  7784 71539 42843 86996 73272 72867 27980 58286
 71208 71971 73391 64095 72427 38627 62898 71698 12770 72338 68629 28530
 55863]
Number of zip codes missing:  157
In [ ]:
# Drop rows lacking either customer or seller coordinates (required for plotting).
coord_cols = ['c_geolocation_lat', 'c_geolocation_lng', 's_geolocation_lat', 's_geolocation_lng']
orders_merge_items_geo_clean = orders_merge_items_geo.dropna(subset=coord_cols)

Aggregate order items (number and type of products, the price and freight value) to a core orders table

---> a unit of analysis is a particular order customer places

In [ ]:
# orders_merge_items holds one row per order item, so order_ids repeat.
# Collapse to one row per order: sum the monetary fields, average the other
# numeric fields (geolocation values repeat within an order, so their mean is
# unchanged), keep the first customer_id / order_status, and take the mode of
# seller_id / product_id.

def _most_frequent(values):
    # Mode of the group; groupby yields non-empty groups, so index [0] is safe.
    return values.mode()[0]

agg_funcs = {}

# Numeric and timedelta columns: sum price/freight fields, average the rest.
numeric_columns = orders_merge_items_geo_clean.select_dtypes(include=['float64', 'int64', 'timedelta64[ns]']).columns
for col in numeric_columns:
    agg_funcs[col] = 'sum' if ('price' in col or 'freight_value' in col) else 'mean'

# Object columns: order_id is the grouping key and needs no aggregation;
# seller_id / product_id take the most frequent value, everything else the
# first value (one customer per order is assumed).
object_columns = orders_merge_items_geo_clean.select_dtypes(include=['object']).columns
for col in object_columns:
    if col == 'order_id':
        continue
    agg_funcs[col] = _most_frequent if col in ('seller_id', 'product_id') else 'first'

# Datetime and period columns: keep the first value in group order
# (NOTE(review): positional first, not necessarily the earliest timestamp).
datetime_columns = orders_merge_items_geo_clean.select_dtypes(include=['datetime64[ns]', 'period[M]']).columns
for col in datetime_columns:
    agg_funcs[col] = 'first'

# Collapse to one row per order and bring order_id back as a column.
orders_merged = orders_merge_items_geo_clean.groupby('order_id').agg(agg_funcs).reset_index()

# Display the first few rows of the result
orders_merged.head()
Out[ ]:
order_id delivery_time delivery_time_days order_processing_time carrier_delivery_time customer_delivery_time time_to_estimate_delivery order_processing_time_days carrier_delivery_time_days customer_delivery_time_days ... customer_id order_status product_id seller_id order_purchase_timestamp order_approved_at order_delivered_carrier_date order_delivered_customer_date order_estimated_delivery_date year_month
0 00010242fe8c5a6d1ba2dd792cb16214 7 days 14:44:46 7.614421 0 days 00:46:33 6 days 08:48:41 1 days 05:09:32 -9 days +23:43:48 0.032326 6.367141 1.214954 ... 3ce436f183e68e07877b285a838db11a delivered 4244733e06e7ecb4970a6e2683c13e61 48436dade18ac8b2bce089ec2a041202 2017-09-13 08:59:02 2017-09-13 09:45:35 2017-09-19 18:34:16 2017-09-20 23:43:48 2017-09-29 2017-09
1 00018f77f2f0320c557190d7a144bdd3 16 days 05:11:18 16.216181 0 days 00:12:07 8 days 03:29:47 8 days 01:29:24 -3 days +16:04:24 0.008414 8.145683 8.062083 ... f6dd3ec061db4e3987629fe6b26e5cce delivered e5f2d52b802189ee658865ca93d83a8f dd7ddc04e1b6c2c614352b383efe2d36 2017-04-26 10:53:06 2017-04-26 11:05:13 2017-05-04 14:35:00 2017-05-12 16:04:24 2017-05-15 2017-04
2 000229ec398224ef6ca0657da4fc703e 7 days 22:45:45 7.948437 0 days 00:14:59 1 days 21:48:18 6 days 00:42:28 -14 days +13:19:16 0.010405 1.908542 6.029491 ... 6489ae5e4333f3693df5ad4372dab6d3 delivered c777355d18b72b67abbeef9df44fd0fd 5b51032eddd242adc84c38acab88f23d 2018-01-14 14:33:31 2018-01-14 14:48:30 2018-01-16 12:36:48 2018-01-22 13:19:16 2018-02-05 2018-01
3 00024acbcdf0a6daa1e931b038114c75 6 days 03:32:04 6.147269 0 days 00:09:43 2 days 03:17:42 4 days 00:04:39 -6 days +13:32:39 0.006748 2.137292 4.003229 ... d4eb9395c8c0431ee92fce09860c5a06 delivered 7634da152a4610f1595efa32f14722fc 9d7a1d34a5052409006425275ba1c2b4 2018-08-08 10:00:35 2018-08-08 10:10:18 2018-08-10 13:28:00 2018-08-14 13:32:39 2018-08-20 2018-08
4 00042b26cf59d7ce69dfabb4e55b4fd9 25 days 02:44:40 25.114352 0 days 00:12:22 11 days 19:35:56 13 days 06:56:22 -16 days +16:42:31 0.008588 11.816620 13.289144 ... 58dbd0b2d70206bf40e62cd34e84d795 delivered ac6c3623068f30de03045865e4e10089 df560393f3a51e74553ab94004ba5c87 2017-02-04 13:57:51 2017-02-04 14:10:13 2017-02-16 09:46:09 2017-03-01 16:42:31 2017-03-17 2017-02

5 rows × 30 columns

Adding relevant payments info (e.g. payment type, # of installments, payment value) to core orders table¶

In [ ]:
# Distribution of payment_sequential: how many payment records occupy each
# sequential position (most orders are paid in a single payment).
order_payments['payment_sequential'].value_counts().reset_index()
Out[ ]:
payment_sequential count
0 1 99360
1 2 3039
2 3 581
3 4 278
4 5 170
5 6 118
6 7 82
7 8 54
8 9 43
9 10 34
10 11 29
11 12 21
12 13 13
13 14 10
14 15 8
15 18 6
16 19 6
17 16 6
18 17 6
19 21 4
20 20 4
21 22 3
22 26 2
23 24 2
24 23 2
25 25 2
26 29 1
27 28 1
28 27 1
In [ ]:
# Collapse payments to one row per order: total value paid, the most common
# payment type, and the maximum number of installments observed.
order_payments_aggregated = (
    order_payments
    .groupby('order_id')
    .agg(
        total_payment_value=('payment_value', 'sum'),
        payment_type=('payment_type', lambda s: s.mode()[0]),
        payment_installments=('payment_installments', 'max'),
    )
    .reset_index()
)

order_payments_aggregated.head()
Out[ ]:
order_id total_payment_value payment_type payment_installments
0 00010242fe8c5a6d1ba2dd792cb16214 72.19 credit_card 2
1 00018f77f2f0320c557190d7a144bdd3 259.83 credit_card 3
2 000229ec398224ef6ca0657da4fc703e 216.87 credit_card 5
3 00024acbcdf0a6daa1e931b038114c75 25.78 credit_card 2
4 00042b26cf59d7ce69dfabb4e55b4fd9 218.04 credit_card 3
In [ ]:
# Left-join the aggregated payment info onto the per-order table.
payment_cols = ['order_id', 'payment_type', 'payment_installments', 'total_payment_value']
orders_merged = orders_merged.merge(order_payments_aggregated[payment_cols], on='order_id', how='left')
orders_merged.head()
Out[ ]:
order_id delivery_time delivery_time_days order_processing_time carrier_delivery_time customer_delivery_time time_to_estimate_delivery order_processing_time_days carrier_delivery_time_days customer_delivery_time_days ... seller_id order_purchase_timestamp order_approved_at order_delivered_carrier_date order_delivered_customer_date order_estimated_delivery_date year_month payment_type payment_installments total_payment_value
0 00010242fe8c5a6d1ba2dd792cb16214 7 days 14:44:46 7.614421 0 days 00:46:33 6 days 08:48:41 1 days 05:09:32 -9 days +23:43:48 0.032326 6.367141 1.214954 ... 48436dade18ac8b2bce089ec2a041202 2017-09-13 08:59:02 2017-09-13 09:45:35 2017-09-19 18:34:16 2017-09-20 23:43:48 2017-09-29 2017-09 credit_card 2 72.19
1 00018f77f2f0320c557190d7a144bdd3 16 days 05:11:18 16.216181 0 days 00:12:07 8 days 03:29:47 8 days 01:29:24 -3 days +16:04:24 0.008414 8.145683 8.062083 ... dd7ddc04e1b6c2c614352b383efe2d36 2017-04-26 10:53:06 2017-04-26 11:05:13 2017-05-04 14:35:00 2017-05-12 16:04:24 2017-05-15 2017-04 credit_card 3 259.83
2 000229ec398224ef6ca0657da4fc703e 7 days 22:45:45 7.948437 0 days 00:14:59 1 days 21:48:18 6 days 00:42:28 -14 days +13:19:16 0.010405 1.908542 6.029491 ... 5b51032eddd242adc84c38acab88f23d 2018-01-14 14:33:31 2018-01-14 14:48:30 2018-01-16 12:36:48 2018-01-22 13:19:16 2018-02-05 2018-01 credit_card 5 216.87
3 00024acbcdf0a6daa1e931b038114c75 6 days 03:32:04 6.147269 0 days 00:09:43 2 days 03:17:42 4 days 00:04:39 -6 days +13:32:39 0.006748 2.137292 4.003229 ... 9d7a1d34a5052409006425275ba1c2b4 2018-08-08 10:00:35 2018-08-08 10:10:18 2018-08-10 13:28:00 2018-08-14 13:32:39 2018-08-20 2018-08 credit_card 2 25.78
4 00042b26cf59d7ce69dfabb4e55b4fd9 25 days 02:44:40 25.114352 0 days 00:12:22 11 days 19:35:56 13 days 06:56:22 -16 days +16:42:31 0.008588 11.816620 13.289144 ... df560393f3a51e74553ab94004ba5c87 2017-02-04 13:57:51 2017-02-04 14:10:13 2017-02-16 09:46:09 2017-03-01 16:42:31 2017-03-17 2017-02 credit_card 3 218.04

5 rows × 33 columns

Adding relevant product info to core orders table¶

In [ ]:
# Attach product attributes and derive the product volume (cm^3) from its dimensions.
product_cols = ['product_id', 'product_category_name', 'product_weight_g', 'product_length_cm', 'product_height_cm', 'product_width_cm']
orders_merged = orders_merged.merge(products[product_cols], on='product_id', how='left')
orders_merged['product_vol'] = (
    orders_merged['product_length_cm']
    * orders_merged['product_height_cm']
    * orders_merged['product_width_cm']
)
orders_merged.head()
Out[ ]:
order_id delivery_time delivery_time_days order_processing_time carrier_delivery_time customer_delivery_time time_to_estimate_delivery order_processing_time_days carrier_delivery_time_days customer_delivery_time_days ... year_month payment_type payment_installments total_payment_value product_category_name product_weight_g product_length_cm product_height_cm product_width_cm product_vol
0 00010242fe8c5a6d1ba2dd792cb16214 7 days 14:44:46 7.614421 0 days 00:46:33 6 days 08:48:41 1 days 05:09:32 -9 days +23:43:48 0.032326 6.367141 1.214954 ... 2017-09 credit_card 2 72.19 cool_stuff 650.0 28.0 9.0 14.0 3528.0
1 00018f77f2f0320c557190d7a144bdd3 16 days 05:11:18 16.216181 0 days 00:12:07 8 days 03:29:47 8 days 01:29:24 -3 days +16:04:24 0.008414 8.145683 8.062083 ... 2017-04 credit_card 3 259.83 pet_shop 30000.0 50.0 30.0 40.0 60000.0
2 000229ec398224ef6ca0657da4fc703e 7 days 22:45:45 7.948437 0 days 00:14:59 1 days 21:48:18 6 days 00:42:28 -14 days +13:19:16 0.010405 1.908542 6.029491 ... 2018-01 credit_card 5 216.87 moveis_decoracao 3050.0 33.0 13.0 33.0 14157.0
3 00024acbcdf0a6daa1e931b038114c75 6 days 03:32:04 6.147269 0 days 00:09:43 2 days 03:17:42 4 days 00:04:39 -6 days +13:32:39 0.006748 2.137292 4.003229 ... 2018-08 credit_card 2 25.78 perfumaria 200.0 16.0 10.0 15.0 2400.0
4 00042b26cf59d7ce69dfabb4e55b4fd9 25 days 02:44:40 25.114352 0 days 00:12:22 11 days 19:35:56 13 days 06:56:22 -16 days +16:42:31 0.008588 11.816620 13.289144 ... 2017-02 credit_card 3 218.04 ferramentas_jardim 3750.0 35.0 40.0 30.0 42000.0

5 rows × 39 columns

Adding review score to core orders table¶

In [ ]:
# Attach review scores to test whether reviews correlate with delivery performance.
# NOTE(review): if an order can have multiple reviews, this left join duplicates
# order rows — verify that `reviews` has one row per order_id.
orders_merged = orders_merged.merge(reviews[['order_id', 'review_score']], on='order_id', how='left')
orders_merged.head()
Out[ ]:
order_id delivery_time delivery_time_days order_processing_time carrier_delivery_time customer_delivery_time time_to_estimate_delivery order_processing_time_days carrier_delivery_time_days customer_delivery_time_days ... payment_type payment_installments total_payment_value product_category_name product_weight_g product_length_cm product_height_cm product_width_cm product_vol review_score
0 00010242fe8c5a6d1ba2dd792cb16214 7 days 14:44:46 7.614421 0 days 00:46:33 6 days 08:48:41 1 days 05:09:32 -9 days +23:43:48 0.032326 6.367141 1.214954 ... credit_card 2 72.19 cool_stuff 650.0 28.0 9.0 14.0 3528.0 5.0
1 00018f77f2f0320c557190d7a144bdd3 16 days 05:11:18 16.216181 0 days 00:12:07 8 days 03:29:47 8 days 01:29:24 -3 days +16:04:24 0.008414 8.145683 8.062083 ... credit_card 3 259.83 pet_shop 30000.0 50.0 30.0 40.0 60000.0 4.0
2 000229ec398224ef6ca0657da4fc703e 7 days 22:45:45 7.948437 0 days 00:14:59 1 days 21:48:18 6 days 00:42:28 -14 days +13:19:16 0.010405 1.908542 6.029491 ... credit_card 5 216.87 moveis_decoracao 3050.0 33.0 13.0 33.0 14157.0 5.0
3 00024acbcdf0a6daa1e931b038114c75 6 days 03:32:04 6.147269 0 days 00:09:43 2 days 03:17:42 4 days 00:04:39 -6 days +13:32:39 0.006748 2.137292 4.003229 ... credit_card 2 25.78 perfumaria 200.0 16.0 10.0 15.0 2400.0 4.0
4 00042b26cf59d7ce69dfabb4e55b4fd9 25 days 02:44:40 25.114352 0 days 00:12:22 11 days 19:35:56 13 days 06:56:22 -16 days +16:42:31 0.008588 11.816620 13.289144 ... credit_card 3 218.04 ferramentas_jardim 3750.0 35.0 40.0 30.0 42000.0 5.0

5 rows × 40 columns

Calculating derived attributes¶

Calculating other derived attributes¶


Preprocessing time

Carrier takeover time

Customer delivery time

Total delivery time

Time to estimate days

Late deliveries (%)


  1. Distance (km) (using Haversine formula)

  2. Distance efficiency (distance_km / delivery_time_days) ---> higher values indicate faster deliveries across distances

  3. Freight cost per km (freight_value / distance_km) ---> identifies how freight scales with distance

  4. Price to distance ratio (price / distance_km) ---> evaluate if higher-priced products tend to be delivered over longer or shorter distances

  5. Cost efficiency (freight_value / delivery_time_days) ---> how efficiently freight costs are being used relative to the delivery speed

  6. Order size per km (price / distance_km) ---> whether higher priced products tend to have longer delivery

  7. Total order cost per km (total_payment_value / distance_km)

  8. Freight share (freight_value / total_payment_value)


  1. Preprocessing time % (order_processing_time_days / delivery_time_days)

  2. Carrier takeover time % (carrier_delivery_time_days / delivery_time_days)

  3. Customer delivery time % (customer_delivery_time_days / delivery_time_days)


In [ ]:
# @title Calculations
# Great-circle distance between two (lat, lon) points via the Haversine formula.

import numpy as np

def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometers.

    Works element-wise on scalars or array-likes via numpy broadcasting.
    """
    earth_radius_km = 6371
    lat1_rad = np.radians(lat1)
    lat2_rad = np.radians(lat2)
    dlat = np.radians(lat2 - lat1)
    dlon = np.radians(lon2 - lon1)

    # Haversine of the central angle, then invert to the angle itself.
    chord = np.sin(dlat / 2) ** 2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(dlon / 2) ** 2
    central_angle = 2 * np.arctan2(np.sqrt(chord), np.sqrt(1 - chord))

    return earth_radius_km * central_angle

# calculate customer<->seller great-circle distance for each order
orders_merged['distance_km'] = haversine(
    orders_merged['c_geolocation_lat'],
    orders_merged['c_geolocation_lng'],
    orders_merged['s_geolocation_lat'],
    orders_merged['s_geolocation_lng']
)

# derived delivery/cost metrics (one value per order)
# 1) distance_efficiency: km covered per delivery day ---> higher values indicate faster deliveries across distances
orders_merged['distance_efficiency'] = orders_merged['distance_km'] / orders_merged['delivery_time_days']

# 2) freight_cost_per_km: how freight cost scales with distance (can identify costly deliveries)
orders_merged['freight_cost_per_km'] = orders_merged['freight_value'] / orders_merged['distance_km']

# 3) price_to_distance: evaluate if higher-priced products tend to be delivered over longer or shorter distances
orders_merged['price_to_distance'] = orders_merged['price'] / orders_merged['distance_km']

# 4) cost_efficiency: how efficiently freight costs are being used relative to the delivery speed
orders_merged['cost_efficiency'] = orders_merged['freight_value'] / orders_merged['delivery_time_days']

# 5) order_size_per_km — NOTE(review): identical formula to price_to_distance (price / distance_km);
# kept because downstream cells reference both columns
orders_merged['order_size_per_km'] = orders_merged['price'] / orders_merged['distance_km']

# 6) total_order_cost_per_km: total payment value spread over the distance
orders_merged['total_order_cost_per_km'] = orders_merged['total_payment_value'] / orders_merged['distance_km']

# 7) freight_share: the share of freight cost in total order value
orders_merged['freight_share'] = orders_merged['freight_value'] / orders_merged['total_payment_value']

# 8) late_delivery: 1 for late and 0 for on-time deliveries
# (vectorized instead of row-wise .apply; NaN > 0 is False and maps to 0,
# matching the original lambda's behavior)
orders_merged['late_delivery'] = (orders_merged['time_to_estimate_delivery_days'] > 0).astype(int)

Removal of outliers¶

In [ ]:
# @title Box Plots of Delivery Times
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl

# Global font/style settings shared by every subplot.
mpl.rcParams.update({
    "font.family": "serif",
    "axes.titlesize": 16,
    "axes.labelsize": 12,
    "xtick.labelsize": 10,
    "ytick.labelsize": 10,
})

# Columns screened for outliers in the next step.
columns_to_check = [
    'delivery_time_days', 'order_processing_time_days', 'carrier_delivery_time_days',
    'customer_delivery_time_days', 'time_to_estimate_delivery_days', 'distance_km'
]

# One box plot per column on a 3-row grid, colored along the Viridis map.
plt.figure(figsize=(15, 10))
viridis_colors = plt.cm.viridis(np.linspace(0, 1, len(columns_to_check)))
grid_cols = (len(columns_to_check) + 2) // 3

for idx, col in enumerate(columns_to_check):
    plt.subplot(3, grid_cols, idx + 1)
    sns.boxplot(data=orders_merged[col], orient='v', color=viridis_colors[idx])
    plt.title(col, fontsize=12)

    # Hide every spine except the bottom one for a cleaner look.
    ax = plt.gca()
    for spine in ('top', 'right', 'left'):
        ax.spines[spine].set_visible(False)

plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
# @title Removing outliers
# Drop rows falling outside the 1.5 * IQR fences, column by column.
def remove_outliers_iqr(df, columns):
    """Return a copy of df with rows outside the 1.5*IQR fences removed.

    Columns are filtered sequentially, so each column's quartiles are computed
    on data already trimmed by the previous columns (order-dependent).
    Rows with NaN in a checked column are dropped as well.
    """
    filtered = df.copy()

    for col in columns:
        q1 = filtered[col].quantile(0.25)
        q3 = filtered[col].quantile(0.75)
        iqr = q3 - q1

        # Tukey fences: 1.5 * IQR beyond the quartiles (inclusive bounds).
        low, high = q1 - 1.5 * iqr, q3 + 1.5 * iqr
        filtered = filtered[filtered[col].between(low, high)]

    return filtered

# Trim outliers from the delivery-time and distance columns defined above,
# then report the resulting size.
orders_cleaned = remove_outliers_iqr(orders_merged, columns_to_check)
print("Dataset shape after:", orders_cleaned.shape)
Dataset shape after: (68055, 49)

Inspecting derived attributes¶

  • some of the metrics can contain +inf or -inf values (e.g., price to distance) when the distance is close to 0 km (perhaps 400 m)

---> therefore, we need to trim these values

  • we can use +/- 3 std as a replacement value
In [ ]:
import numpy as np

def replace_inf_with_thresholds(df, metrics):
    """Replace +inf with (mean + 3*std) and -inf with (mean - 3*std) for the given metrics.

    Statistics are computed over the finite values only (infinities excluded).
    Returns a modified copy; the input DataFrame is left untouched.
    """
    result = df.copy()

    for metric in metrics:
        column = result[metric]

        # Mean/std over finite values only — infinities are treated as missing.
        finite = column.replace([np.inf, -np.inf], np.nan)
        upper = finite.mean() + 3 * finite.std()
        lower = finite.mean() - 3 * finite.std()

        # Clamp the infinities to the +/- 3-sigma thresholds.
        column = column.mask(column == np.inf, upper)
        result[metric] = column.mask(column == -np.inf, lower)

    return result

# Metrics that can blow up to +/-inf (e.g. when distance or payment value is ~0).
metrics_to_clean = [
    'freight_cost_per_km', 'price_to_distance', 'cost_efficiency',
    'order_size_per_km', 'total_order_cost_per_km', 'freight_share', 'review_score'
]

# Clamp any infinities in these columns to their +/- 3-std thresholds.
orders_cleaned = replace_inf_with_thresholds(orders_cleaned, metrics_to_clean)

Exploratory Data Analysis¶

Correlation matrix¶

In [ ]:
# @title Correlation matrix
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Numeric columns included in the correlation analysis.
cols = [
    'processing_share', 'carrier_share', 'customer_share', 'price',
    'freight_value', 'payment_installments', 'total_payment_value',
    'distance_km', 'distance_efficiency', 'freight_cost_per_km',
    'price_to_distance', 'cost_efficiency', 'order_size_per_km',
    'total_order_cost_per_km', 'freight_share', 'product_weight_g',
    'product_length_cm', 'product_height_cm', 'product_width_cm',
    'product_vol', 'review_score', 'delivery_time_days', 'order_processing_time_days',
    'carrier_delivery_time_days', 'customer_delivery_time_days', 'time_to_estimate_delivery_days'
]

corr_matrix = orders_cleaned[cols].corr()

# The matrix is symmetric, so hide the redundant upper triangle.
upper_triangle = np.triu(np.ones_like(corr_matrix, dtype=bool))

# Large canvas so the 26x26 annotated grid stays readable.
plt.figure(figsize=(30, 18))

ax = sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",  # two decimal places per cell
    annot_kws={"size": 10},
    cmap="viridis",
    mask=upper_triangle,
    square=True,
    cbar_kws={"shrink": .75},
)
ax.set_title('Correlation Heatmap', fontdict={'fontsize': 16}, pad=16)

plt.show()
No description has been provided for this image
In [ ]:
# we can observe that product price is positively correlated with distance in km (0.32) and distance efficiency (0.27) ---> higher-priced products tend to be delivered faster
# and negatively correlated with freight cost per km (-0.33) ---> freight cost scales well with distance for higher-priced products (well optimised) --> freight costs contribute less to the total cost
# review score is slightly positively correlated with distance efficiency (0.09) and cost efficiency (0.08),
# and negatively correlated with customer delivery time and time to estimate delivery

Late vs. On-time Deliveries¶

In [ ]:
# @title Key Performance Metrics for Late vs. On-Time Deliveries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Two shades from the Viridis palette: one for on-time, one for late bars.
colors = sns.color_palette("viridis", 2)

# Work on a copy so the cleaned dataset is not mutated by this cell.
df_copy = orders_cleaned.copy()

# Binary late-delivery flag (vectorized; NaN compares False and maps to 0,
# matching the original row-wise lambda).
df_copy['late_delivery'] = (df_copy['time_to_estimate_delivery_days'] > 0).astype(int)

# Step 1: Get the top 10 product categories based on total delivery counts
top_10_categories = df_copy['product_category_name'].value_counts().nlargest(10).index

# List of metrics to plot
metrics = [
    'total_payment_value', 'price', 'freight_value', 'distance_km',
    'review_score', 'distance_efficiency', 'freight_cost_per_km',
    'price_to_distance', 'cost_efficiency', 'order_size_per_km',
    'total_order_cost_per_km', 'freight_share'
]

# Category mapping for English translation
category_mapping = {
    'cama_mesa_banho': 'Bedding, Table, and Bath',
    'beleza_saude': 'Beauty and Health',
    'esporte_lazer': 'Sports and Leisure',
    'utilidades_domesticas': 'Household Utilities',
    'moveis_decoracao': 'Furniture and Decoration',
    'informatica_acessorios': 'Computers and Accessories',
    'relogios_presentes': 'Watches and Gifts',
    'brinquedos': 'Toys',
    'telefonia': 'Telephony',
    'automotivo': 'Automotive'
}

# Step 2: Clamp infinite values to mean +/- 3*std BEFORE plotting.
# BUG FIX: the original computed mean/std while +/-inf were still present
# (contaminating the statistics) and then replaced inf with NaN, which made
# the subsequent threshold substitutions dead code. Here the statistics are
# taken over finite values only and the thresholds are applied to the actual
# infinities, mirroring replace_inf_with_thresholds above.
for metric in metrics:
    finite = df_copy[metric].replace([np.inf, -np.inf], np.nan)
    mean_value = finite.mean()
    std_value = finite.std()

    df_copy[metric] = np.where(df_copy[metric] == np.inf, mean_value + 3 * std_value, df_copy[metric])
    df_copy[metric] = np.where(df_copy[metric] == -np.inf, mean_value - 3 * std_value, df_copy[metric])

# Step 3: One bar chart per metric, stacked vertically.
fig, axs = plt.subplots(nrows=len(metrics), ncols=1, figsize=(14, 6 * len(metrics)))

for i, metric in enumerate(metrics):
    # Mean of the metric per (category, late_delivery) pair.
    mean_values = df_copy.groupby(['product_category_name', 'late_delivery'])[metric].mean().reset_index()

    # Keep only the top 10 categories.
    mean_values = mean_values[mean_values['product_category_name'].isin(top_10_categories)]

    # Pivot so on-time (0) and late (1) become separate columns.
    mean_values_long = mean_values.pivot(index='product_category_name',
                                         columns='late_delivery',
                                         values=metric).reset_index()

    # NOTE(review): assumes both late_delivery values (0 and 1) occur for every
    # top-10 category; otherwise this positional rename mislabels columns.
    mean_values_long.columns = ['product_category_name', 'On Time', 'Late']

    # Translate category names to English for display.
    mean_values_long['product_category_name'] = mean_values_long['product_category_name'].map(category_mapping)

    # Create the bar plot
    mean_values_long.set_index('product_category_name').plot(kind='bar', ax=axs[i], color=colors)

    # Titles, labels, and readable category ticks.
    axs[i].set_title(f'Mean {metric.replace("_", " ").title()} for Top 10 Product Categories')
    axs[i].set_xlabel('Product Categories')
    axs[i].set_ylabel(f'Mean {metric.replace("_", " ").title()}')
    axs[i].set_xticks(range(len(mean_values_long['product_category_name'])))
    axs[i].set_xticklabels(mean_values_long['product_category_name'], rotation=45)
    axs[i].legend(title='Delivery Status', labels=['On Time', 'Late'])

    # Remove the top, left, and right spines for a cleaner look.
    axs[i].spines['top'].set_visible(False)
    axs[i].spines['left'].set_visible(False)
    axs[i].spines['right'].set_visible(False)

# Adjust layout to prevent overlap
plt.tight_layout()

# Show the plot
plt.show()
No description has been provided for this image

Largest Cities Analysis¶

In [ ]:
# here we could produce more aggregated stats and explore differences across cities, product categories, sellers

# 1) groupby city (see average delivery time, late deliveries %, total payment value, freight share)
In [ ]:
# @title Assigning city labels based on geographic coordinates
# Label each order with the nearest major city (within a 50 km radius), if any.
import pandas as pd
from geopy.distance import geodesic

# Reference center coordinates (lat, lng) of the largest cities considered.
cities_coords = {
    'Rio de Janeiro': (-22.9068, -43.1729),
    'Sao Paulo': (-23.5505, -46.6333),
    'Porto Alegre': (-30.0346, -51.2177),
    'Curitiba': (-25.4284, -49.2733),
    #'Fortaleza': (-3.7172, -38.5434),
    'Salvador': (-12.9714, -38.5014),
    'Brasilia': (-15.7942, -47.8822)
}

def label_city(row):
    """Return the first city whose center lies within 50 km of the customer, else None."""
    customer_point = (row['c_geolocation_lat'], row['c_geolocation_lng'])
    for city, center in cities_coords.items():
        if geodesic(customer_point, center).km <= 50:  # 50 km radius
            return city
    return None

# Row-wise labelling (one geodesic call per city per row — slow on large frames, but exact).
orders_cleaned['city'] = orders_cleaned.apply(label_city, axis=1)
In [ ]:
# @title Delivery Times by Largest Cities
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Define metrics for aggregation.
# 'order_id' is counted (then renamed to 'count'); the rest are averaged.
metrics = ['order_id', 'delivery_time_days', 'late_delivery', 'total_payment_value', 'freight_share']

# Group by city and calculate the mean for each metric.
# Note: mean of the 0/1 late_delivery flag is the late-delivery rate per city.
grouped = orders_cleaned.groupby('city')[metrics].agg(
    {'order_id': 'count', 'delivery_time_days': 'mean', 'late_delivery': 'mean', 'total_payment_value': 'mean', 'freight_share': 'mean'}
).reset_index().rename(columns={'order_id': 'count'})

# Set up plot style and font properties (applies to all subplots below)
plt.rcParams.update({
    "font.family": "serif",
    "axes.titlesize": 14,
    "axes.labelsize": 12,
    "xtick.labelsize": 10,
    "ytick.labelsize": 10,
})
# Convert the color palette to a list and limit the colors to the number of cities
viridis_colors = list(plt.cm.viridis(np.linspace(0, 1, len(grouped['city'].unique()))))

# Create four bar subplots with custom formatting
# (metrics[1:] skips 'order_id'/'count', leaving exactly four metrics for the 2x2 grid)
plt.figure(figsize=(14, 10))

for i, metric in enumerate(metrics[1:]):
    ax = plt.subplot(2, 2, i+1)
    # hue='city' with legend=False keeps one color per city without a redundant legend
    sns.barplot(data=grouped, x='city', y=metric, hue="city", palette=viridis_colors[:len(grouped['city'].unique())], legend=False)
    ax.set_title(metric.replace('_', ' ').capitalize())
    ax.set_xlabel('City')
    ax.set_ylabel(metric.replace('_', ' ').capitalize())
    # Hide all spines except the bottom one for a cleaner look
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_visible(False)
    plt.xticks(rotation=45, ha='right')

plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
# 2) in rows we have cities and in columns we have product categories
  # the matrix can display:
  # the count of orders ---> this can tell whether certain product categories are ordered more or less often from remote areas
  # average delivery time (in days) ---> whether delivery time differs for certain cities/ products
  # delivery delay % ---> the same
In [ ]:
# @title Key Sales Metric Comparison Across Cities - Heatmaps
# Builds one heatmap per metric: cities on rows, top-10 product categories on columns.
# The first two metrics are colour-scaled over the whole matrix; the rest column-wise.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Assuming 'orders_cleaned' is already labeled with cities

# Step 1: Group by city and product category, including additional metrics
grouped = orders_cleaned.groupby(['city', 'product_category_name']).agg(
    order_count=('order_id', 'count'),
    avg_delivery_time=('delivery_time_days', 'mean'),
    late_delivery_count=('late_delivery', lambda x: (x == 1).sum()),  # number of late orders in the group
    freight_cost_per_km=('freight_cost_per_km', 'mean'),
    price_to_distance=('price_to_distance', 'mean'),
    cost_efficiency=('cost_efficiency', 'mean'),
    order_size_per_km=('order_size_per_km', 'mean'),
    total_order_cost_per_km=('total_order_cost_per_km', 'mean'),
    freight_share=('freight_share', 'mean'),
    review_score=('review_score', 'mean')
).reset_index()

# Step 2: Calculate delivery delay percentage (share of orders delivered late, in %)
grouped['delivery_delay_percentage'] = (grouped['late_delivery_count'] / grouped['order_count']) * 100

# Step 3: Get top 10 product categories based on order count
top_10_product_categories = grouped.groupby('product_category_name')['order_count'].sum().nlargest(10).index.tolist()

# Filter grouped data for top 10 product categories
grouped_top_10 = grouped[grouped['product_category_name'].isin(top_10_product_categories)]

# Step 5: Create a mapping for product categories (Portuguese -> English display names)
category_mapping = {
    'automotivo': 'Automotive',
    'beleza_saude': 'Beauty & Health',
    'brinquedos': 'Toys',
    'cama_mesa_banho': 'Bedding, Table & Bath',
    'esporte_lazer': 'Sports & Leisure',
    'informatica_acessorios': 'Computers & Accessories',
    'moveis_decoracao': 'Furniture & Decoration',
    'relogios_presentes': 'Watches & Gifts',
    'telefonia': 'Telephony',
    'utilidades_domesticas': 'Household Items'
}

# Step 6: Metrics and formatting information: column name -> (plot title, decimal places for annotations)
metrics = {
    'order_count': ('Order Count', 0),
    'avg_delivery_time': ('Average Delivery Time (Days)', 2),
    'delivery_delay_percentage': ('Delivery Delay Percentage (%)', 2),
    'freight_cost_per_km': ('Freight Cost per Km', 2),
    'price_to_distance': ('Price to Distance', 2),
    'cost_efficiency': ('Cost Efficiency', 2),
    'order_size_per_km': ('Order Size per Km', 2),
    'total_order_cost_per_km': ('Total Order Cost per Km', 2),
    'freight_share': ('Freight Share', 2),
    'review_score': ('Review Score', 2)
}

# Define color map notes for matrix-wide and column-wise scaling
matrix_note = "*Color map applied for whole matrix, comparing all cities and product categories"
column_note = "*Color map applied column-wise, comparing cities per product category"

# Step 7: Loop through each metric, create pivot tables, and display as heatmaps with appropriate scaling
for i, (metric, (title, decimal_places)) in enumerate(metrics.items()):
    # Create a pivot table; missing (city, category) combinations are filled with 0
    # NOTE(review): fillna(0) conflates "no orders" with a genuine 0 value — confirm acceptable.
    pivot_table = grouped_top_10.pivot(index='city', columns='product_category_name', values=metric).fillna(0)
    pivot_table.rename(columns=category_mapping, inplace=True)
    pivot_table.columns.name = None  # Drop the columns-axis name so it doesn't render in the plot

    # Decide on normalization type and set appropriate note
    # NOTE(review): min-max division yields NaN if the matrix (or a column) is constant — those cells render blank.
    if i < 2:  # For the first two metrics, normalize across the entire matrix
        normed_pivot = (pivot_table - pivot_table.values.min()) / (pivot_table.values.max() - pivot_table.values.min())
        color_map_note = matrix_note
    else:  # For other metrics, normalize each column (product category) independently for column-wise coloring
        normed_pivot = pivot_table.apply(lambda x: (x - x.min()) / (x.max() - x.min()), axis=0)
        color_map_note = column_note

    # Plot the heatmap with normalized values and original values as annotations
    plt.figure(figsize=(12, 8))
    ax = sns.heatmap(
        normed_pivot,
        cmap='viridis',
        annot=pivot_table.round(decimal_places),  # Use original values for annotations
        fmt=f".{decimal_places}f",
        cbar_kws={'label': title}
    )
    plt.title(f"{title} by City and Product Category")
    plt.xlabel("Product Category")
    plt.ylabel("City")
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.tight_layout()

    # Adjust the colorbar position
    plt.subplots_adjust(right=0.85)
    colorbar = ax.collections[0].colorbar

    # Place the color map note slightly to the right and center it vertically with the color bar
    colorbar.ax.text(
        3.0,  # Adjust to move right
        0.5,  # Center vertically
        color_map_note,
        fontsize=10,
        color='black',
        ha='left',
        va='center',  # Center alignment vertically
        transform=colorbar.ax.transAxes
    )

    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [ ]:
# 3) in rows 10-20 largest sellers and in columns: num_orders, average order size (payment_value), average delivery time, average freight share
In [ ]:
# @title Top Sellers with Key Metrics - Heatmap
# One heatmap: top-30 sellers (by order count) on rows, metrics on columns.
# Each column is min-max normalized independently; "negative" metrics are inverted
# so brighter always means better.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Assuming 'orders_cleaned' includes all necessary seller-level data

# Step 1: Group by seller and aggregate metrics
grouped_sellers = orders_cleaned.groupby('seller_id').agg(
    order_count=('order_id', 'count'),
    avg_order_size=('total_payment_value', 'mean'),
    avg_delivery_time=('delivery_time_days', 'mean'),
    avg_freight_share=('freight_share', 'mean'),
    freight_cost_per_km=('freight_cost_per_km', 'mean'),
    price_to_distance=('price_to_distance', 'mean'),
    cost_efficiency=('cost_efficiency', 'mean'),
    order_size_per_km=('order_size_per_km', 'mean'),
    total_order_cost_per_km=('total_order_cost_per_km', 'mean'),
    late_delivery_count=('late_delivery', lambda x: (x == 1).sum()),  # number of late orders per seller
    review_score=('review_score', 'mean')
).reset_index()

# Step 2: Calculate delivery delay percentage (share of a seller's orders delivered late, in %)
grouped_sellers['delivery_delay_percentage'] = (grouped_sellers['late_delivery_count'] / grouped_sellers['order_count']) * 100

# Step 3: Sort sellers by order count and select top 30 sellers
top_sellers = grouped_sellers.sort_values(by='order_count', ascending=False).head(30)

# Step 4: Replace seller_id with rank based on order count
top_sellers['Rank'] = range(1, len(top_sellers) + 1)
top_sellers.set_index('Rank', inplace=True)

# Step 5: Select metrics for heatmap
metrics = [
    'order_count', 'avg_order_size', 'avg_delivery_time', 'avg_freight_share',
    'freight_cost_per_km', 'price_to_distance', 'cost_efficiency', 'order_size_per_km',
    'total_order_cost_per_km', 'delivery_delay_percentage', 'review_score'
]
pivot_data = top_sellers[metrics]

# Define which metrics are "negative" where higher values are not preferred
negative_metrics = ['avg_delivery_time', 'avg_freight_share', 'freight_cost_per_km',
                    'price_to_distance', 'total_order_cost_per_km', 'delivery_delay_percentage']

# Step 6: Normalize each column for per-column color scaling, inverting where higher values are not beneficial
normed_data = pivot_data.apply(lambda x: (x.max() - x) / (x.max() - x.min()) if x.name in negative_metrics
                               else (x - x.min()) / (x.max() - x.min()), axis=0)

# Step 7: Format values for annotation in a single pass: two decimals for all
# metrics except order_count, which is rendered without decimals.
# (A previous astype(int) assignment here was dead code — it was immediately
# overwritten by this apply, which already formats order_count with '{:.0f}'.)
formatted_data = pivot_data.apply(lambda x: x.map('{:.2f}'.format) if x.name != 'order_count' else x.map('{:.0f}'.format))

# Step 8: Plot the heatmap with per-column scaling
plt.figure(figsize=(12, 8))
sns.heatmap(
    normed_data,
    cmap='viridis',
    annot=formatted_data,  # Display pre-formatted string values
    fmt="",  # Empty format: annotations are already strings
    cbar_kws={'label': 'Normalized Scale (Per Column)'}
)

# Add title and labels
plt.title("Top 30 Sellers with Key Metrics - Heatmap")
plt.xlabel("Metric")
plt.ylabel("Largest Sellers (Rank)")

# Customize tick label rotation
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)

# Adjust layout
plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
# @title Delivery Performance Metrics by Product Category by Late and On-Time Deliveries - Heatmaps
# Two heatmaps (On-Time vs Late): top-10 product categories on rows, metrics on columns,
# each column min-max normalized independently.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Step 1: Aggregate by product category and delivery status.
# FIX: order_count previously used ('order_id', 'sum'). order_id is a string key,
# so 'sum' concatenated the IDs instead of counting them, which corrupted the
# top-10 selection below (lexicographic nlargest over concatenated strings).
# 'count' yields the intended number of orders per group.
aggregated_data = orders_cleaned.groupby(['product_category_name', 'late_delivery']).agg(
    order_count=('order_id', 'count'),
    avg_order_size=('total_payment_value', 'mean'),
    avg_delivery_time=('delivery_time_days', 'mean'),
    avg_freight_share=('freight_share', 'mean'),
    freight_cost_per_km=('freight_cost_per_km', 'mean'),
    price_to_distance=('price_to_distance', 'mean'),
    cost_efficiency=('cost_efficiency', 'mean'),
    order_size_per_km=('order_size_per_km', 'mean'),
    total_order_cost_per_km=('total_order_cost_per_km', 'mean'),
    review_score=('review_score', 'mean')
).reset_index()

# Step 2: Calculate the top 10 categories by total order count and filter the data
top_10_categories = aggregated_data.groupby('product_category_name')['order_count'].sum().nlargest(10).index.tolist()
filtered_data = aggregated_data[aggregated_data['product_category_name'].isin(top_10_categories)]

# Step 3: Pivot the DataFrame with delivery status as the second level of columns
pivoted_data = filtered_data.pivot_table(
    index='product_category_name',
    columns='late_delivery',
    values=[
        'avg_order_size',
        'avg_delivery_time',
        'avg_freight_share',
        'freight_cost_per_km',
        'price_to_distance',
        'cost_efficiency',
        'order_size_per_km',
        'total_order_cost_per_km',
        'review_score'
    ],
    aggfunc='mean'
)

# Step 4: Rename columns for clarity (late_delivery 0 -> 'On-Time', otherwise 'Late')
pivoted_data.columns = pd.MultiIndex.from_tuples(
    [(metric, 'On-Time' if late_deliv == 0 else 'Late') for metric, late_deliv in pivoted_data.columns],
    names=["Metric", "Delivery Status"]
)

# Step 5: Reset index to bring 'product_category_name' into the DataFrame
pivoted_data.reset_index(inplace=True)

# Step 6: Translate product category names to English
translation_mapping = {
    'cama_mesa_banho': 'Bedding and Bath',
    'beleza_saude': 'Beauty and Health',
    'esporte_lazer': 'Sports and Leisure',
    'utilidades_domesticas': 'Household Utilities',
    'informatica_acessorios': 'Computers and Accessories',
    'moveis_decoracao': 'Furniture and Decoration',
    'relogios_presentes': 'Watches and Gifts',
    'brinquedos': 'Toys',
    'telefonia': 'Telephony',
    'automotivo': 'Automotive'
}
pivoted_data['product_category_name'] = pivoted_data['product_category_name'].map(translation_mapping).fillna(pivoted_data['product_category_name'])

# Step 7: Order product categories by total order count (descending)
total_order_counts = aggregated_data.groupby('product_category_name')['order_count'].sum()
sorted_categories = total_order_counts[top_10_categories].sort_values(ascending=False).index
pivoted_data['product_category_name'] = pd.Categorical(
    pivoted_data['product_category_name'], categories=sorted_categories, ordered=True
)
pivoted_data = pivoted_data.sort_values('product_category_name')

# Step 8: Reshape so rows are categories and columns are (status, metric)
final_table = pivoted_data.set_index('product_category_name').stack(level=0, future_stack=True).unstack(level=1)

# Step 9: Separate data for On-Time and Late deliveries
on_time_data = final_table.xs('On-Time', level='Delivery Status', axis=1)
late_data = final_table.xs('Late', level='Delivery Status', axis=1)

# Step 10: Normalize each column independently for column-wise color scaling
on_time_normalized = on_time_data.apply(lambda x: (x - x.min()) / (x.max() - x.min()), axis=0)
late_normalized = late_data.apply(lambda x: (x - x.min()) / (x.max() - x.min()), axis=0)

# Step 11: Plot heatmaps for On-Time and Late deliveries with column-wise normalization;
# annotations show the original (un-normalized) values.
plt.figure(figsize=(15, 10))
sns.heatmap(on_time_normalized, cmap='viridis', annot=on_time_data, fmt=".2f", cbar=True)
plt.title("Delivery Performance Metrics by Product Category - On-Time Deliveries")
plt.xticks(rotation=45)
plt.show()

plt.figure(figsize=(15, 10))
sns.heatmap(late_normalized, cmap='viridis', annot=late_data, fmt=".2f", cbar=True)
plt.title("Delivery Performance Metrics by Product Category - Late Deliveries")
plt.xticks(rotation=45)
plt.show()
No description has been provided for this image
No description has been provided for this image

Exploration of the largest cities¶

Rio de Janeiro, Sao Paulo, Porto Alegre, Curitiba, Fortaleza, Salvador (labeled 'Bahia' below), Brasilia

Customer Geolocation¶

Total Delivery Time¶

In [ ]:
# Exploring delivery times based on customer location is essential for several reasons:
#
# 1. Logistics Optimization: Delivery times vary due to geographic factors (distance, traffic, infrastructure).
#    By analyzing regional patterns, companies can identify areas where delays occur and optimize routing,
#    adjust warehouse placement, or streamline processes for better efficiency.
#
# 2. Customer Experience: Understanding location-based delivery performance enables companies to set more
#    accurate delivery expectations, improving customer satisfaction and reducing complaints about late deliveries.
#
# 3. Cost Management: By pinpointing regions with frequent delays, companies can assess whether the delivery
#    costs are proportionate to customer locations and adjust pricing, warehousing, or transportation strategies.

All customers¶

In [ ]:
# @title Delivery Performance in Major Brazilian Cities (Dot Size by Customer Payment Value)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from geopy.distance import geodesic
import numpy as np

# Define cities coordinates (latitude, longitude)
cities_coords = {
    'Rio de Janeiro': (-22.9068, -43.1729),
    'Sao Paulo': (-23.5505, -46.6333),
    'Porto Alegre': (-30.0346, -51.2177),
    'Curitiba': (-25.4284, -49.2733),
    'Fortaleza': (-3.7172, -38.5434),
    'Bahia': (-12.9714, -38.5014),
    'Brasilia': (-15.7942, -47.8822)
}

def filter_city_data(city_coords, radius_km=50):
    """Return a copy of orders_cleaned restricted to customers within radius_km of city_coords.

    Uses geopy's geodesic distance, evaluated row by row — this is the slowest
    part of the cell, so call it once per city and cache the result.
    """
    lat, lng = city_coords
    city_data = orders_cleaned[
        orders_cleaned.apply(
            lambda row: geodesic((row['c_geolocation_lat'], row['c_geolocation_lng']), (lat, lng)).km <= radius_km,
            axis=1
        )
    ].copy()  # Use .copy() to avoid SettingWithCopyWarning
    return city_data

# FIX: filter each city ONCE and reuse the result. The previous version ran the
# expensive per-row geodesic filter twice per city (once for the global color
# scale, once again when plotting), doubling the runtime of this cell.
city_datasets = {city: filter_city_data(coords) for city, coords in cities_coords.items()}

# Global min and max delivery times across all cities for consistent color mapping
all_delivery_times = pd.concat(
    [data['delivery_time_days'].dropna() for data in city_datasets.values()]
)
global_min = all_delivery_times.min()
global_max = all_delivery_times.max()

# Create a 3x3 grid of subplots (sufficient for 7 cities)
fig, axes = plt.subplots(3, 3, figsize=(20, 12))
axes = axes.flatten()

# Loop through cities and plot
for i, (city, coords) in enumerate(cities_coords.items()):
    ax = axes[i]  # Get subplot grid position
    city_data = city_datasets[city]

    if len(city_data) > 0:  # Ensure there's data to plot
        # Scatterplot of delivery times; inverted Viridis so lower (better) times are brighter
        sns.scatterplot(
            x=city_data['c_geolocation_lng'],
            y=city_data['c_geolocation_lat'],
            hue=city_data['delivery_time_days'],  # Use delivery time for color
            palette='viridis_r',  # Inverted Viridis for delivery time (lower is better)
            size=city_data['total_payment_value'],  # Size points by payment value
            sizes=(20, 200),  # Adjust sizes to improve visibility
            ax=ax,
            legend=False  # Disable legend for individual plots
        )

        ax.set_title(f"Delivery Performance in {city}")
        ax.set_xlabel('Longitude')
        ax.set_ylabel('Latitude')

        # Remove the top, left, and right spines
        for spine in ('top', 'left', 'right'):
            ax.spines[spine].set_visible(False)

# Shared horizontal colorbar driven by the global delivery-time range
sm = plt.cm.ScalarMappable(cmap='viridis_r', norm=plt.Normalize(global_min, global_max))
sm.set_array([])  # Only needed for older versions of matplotlib
cbar = plt.colorbar(sm, ax=axes, orientation='horizontal', fraction=0.02, pad=0.1)
cbar.set_label('Delivery Time (Days)')

# Remove any unused subplots
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])

# Adjust layout and add a main title
plt.subplots_adjust(top=0.92, hspace=0.4)  # Adjust space between subplots
plt.suptitle('Delivery Performance in Major Brazilian Cities, Dot Size by Customer Payment Value', fontsize=16, y=0.98)
plt.show()
No description has been provided for this image

Distance Efficiency (distance in km per delivery time in days)¶

---> higher values indicate faster deliveries across distances

In [ ]:
# Analyzing distance efficiency based on customer location is important because:
#
# 1. Operational Optimization: Distance efficiency reveals how effectively routes are being utilized
#    relative to delivery times. It helps identify inefficiencies in delivery networks and provides
#    insights into how to optimize routes and resources for improved service.
#
# 2. Geographic Challenges: Distance efficiency can highlight areas with challenging infrastructure
#    or topography that may cause inefficiencies. Understanding these factors helps adjust strategies
#    for specific regions, such as choosing better-suited transportation methods or adding local warehouses.
#
# 3. Cost Efficiency: Tracking distance efficiency helps identify where excessive fuel or resources
#    may be used due to poor routing or geographic challenges, leading to potential cost reductions in
#    logistics and fuel consumption.
In [ ]:
# @title Delivery Performance in Major Brazilian Cities (Dot Size by Customer Payment Value)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from geopy.distance import geodesic
import numpy as np

# Define cities coordinates (latitude, longitude)
cities_coords = {
    'Rio de Janeiro': (-22.9068, -43.1729),
    'Sao Paulo': (-23.5505, -46.6333),
    'Porto Alegre': (-30.0346, -51.2177),
    'Curitiba': (-25.4284, -49.2733),
    'Fortaleza': (-3.7172, -38.5434),
    'Bahia': (-12.9714, -38.5014),
    'Brasilia': (-15.7942, -47.8822)
}

def filter_city_data(city_coords, radius_km=50):
    """Return a copy of orders_cleaned restricted to customers within radius_km of city_coords.

    Uses geopy's geodesic distance, evaluated row by row — this is the slowest
    part of the cell, so call it once per city and cache the result.
    """
    lat, lng = city_coords
    city_data = orders_cleaned[
        orders_cleaned.apply(
            lambda row: geodesic((row['c_geolocation_lat'], row['c_geolocation_lng']), (lat, lng)).km <= radius_km,
            axis=1
        )
    ].copy()  # Use .copy() to avoid SettingWithCopyWarning
    return city_data

# FIX: filter each city ONCE and reuse the result. The previous version ran the
# expensive per-row geodesic filter twice per city (once for the global color
# scale, once again when plotting), doubling the runtime of this cell.
city_datasets = {city: filter_city_data(coords) for city, coords in cities_coords.items()}

# Global min and max distance efficiency across all cities for consistent color mapping
all_distance_efficiency = pd.concat(
    [data['distance_efficiency'].dropna() for data in city_datasets.values()]
)
global_min = all_distance_efficiency.min()
global_max = all_distance_efficiency.max()

# Create a 3x3 grid of subplots (sufficient for 7 cities)
fig, axes = plt.subplots(3, 3, figsize=(20, 12))
axes = axes.flatten()

# Loop through cities and plot
for i, (city, coords) in enumerate(cities_coords.items()):
    ax = axes[i]  # Get subplot grid position
    city_data = city_datasets[city]

    if len(city_data) > 0:  # Ensure there's data to plot
        # Scatterplot of distance efficiency (higher is better, so Viridis is not inverted)
        sns.scatterplot(
            x=city_data['c_geolocation_lng'],
            y=city_data['c_geolocation_lat'],
            hue=city_data['distance_efficiency'],  # Use distance efficiency for color
            palette='viridis',  # Viridis colormap (not inverted)
            size=city_data['total_payment_value'],  # Size points by payment value
            sizes=(20, 200),  # Adjust sizes to improve visibility
            ax=ax,
            legend=False  # Disable legend for individual plots
        )

        ax.set_title(f"Delivery Performance in {city}")
        ax.set_xlabel('Longitude')
        ax.set_ylabel('Latitude')

        # Remove the top, left, and right spines
        for spine in ('top', 'left', 'right'):
            ax.spines[spine].set_visible(False)

# Shared horizontal colorbar driven by the global distance-efficiency range
sm = plt.cm.ScalarMappable(cmap='viridis', norm=plt.Normalize(global_min, global_max))
sm.set_array([])  # Only needed for older versions of matplotlib
cbar = plt.colorbar(sm, ax=axes, orientation='horizontal', fraction=0.02, pad=0.1)
cbar.set_label('Distance Efficiency')

# Remove any unused subplots
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])

# Adjust layout and add a main title
plt.subplots_adjust(top=0.92, hspace=0.4)  # Adjust space between subplots
plt.suptitle('Delivery Performance in Major Brazilian Cities\nDot Size by Customer Payment Value', fontsize=16, y=0.98)
plt.show()
No description has been provided for this image

Customer reviews¶

In [ ]:
# Analyzing customer review scores based on location provides critical insights into the overall service quality:
#
# 1. Service Quality by Region: Customer satisfaction can vary across different regions due to factors such
#    as delivery speed, product availability, or local preferences. Identifying areas with lower review scores
#    helps target improvements in those regions.
#
# 2. Logistics Impact: Geographical issues, such as infrastructure or delivery delays, can negatively
#    impact customer experiences and reviews. Monitoring scores regionally helps identify if logistics
#    challenges are influencing customer satisfaction.
#
# 3. Targeted Improvements: By understanding the relationship between location and review scores,
#    companies can implement targeted actions, such as improving last-mile delivery services, offering
#    local support, or addressing specific regional complaints to boost customer experience.
In [ ]:
# @title Customer Reviews in Major Brazilian Cities (Dot Size by Total Payment Value)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from geopy.distance import geodesic
import numpy as np

# Define cities coordinates (latitude, longitude)
cities_coords = {
    'Rio de Janeiro': (-22.9068, -43.1729),
    'Sao Paulo': (-23.5505, -46.6333),
    'Porto Alegre': (-30.0346, -51.2177),
    'Curitiba': (-25.4284, -49.2733),
    'Fortaleza': (-3.7172, -38.5434),
    'Bahia': (-12.9714, -38.5014),
    'Brasilia': (-15.7942, -47.8822)
}

def filter_city_data(city_coords, radius_km=50):
    """Return a copy of orders_cleaned restricted to customers within radius_km of city_coords.

    Uses geopy's geodesic distance, evaluated row by row — this is the slowest
    part of the cell, so call it once per city and cache the result.
    """
    lat, lng = city_coords
    city_data = orders_cleaned[
        orders_cleaned.apply(
            lambda row: geodesic((row['c_geolocation_lat'], row['c_geolocation_lng']), (lat, lng)).km <= radius_km,
            axis=1
        )
    ].copy()  # Use .copy() to avoid SettingWithCopyWarning
    return city_data

# FIX: filter each city ONCE and reuse the result. The previous version ran the
# expensive per-row geodesic filter twice per city (once for the global color
# scale, once again when plotting), doubling the runtime of this cell.
city_datasets = {city: filter_city_data(coords) for city, coords in cities_coords.items()}

# Global min and max review scores across all cities for consistent color mapping
all_review_scores = pd.concat(
    [data['review_score'].dropna() for data in city_datasets.values()]
)
global_min = all_review_scores.min()
global_max = all_review_scores.max()

# Create a 3x3 grid of subplots (sufficient for 7 cities)
fig, axes = plt.subplots(3, 3, figsize=(20, 12))
axes = axes.flatten()

# Loop through cities and plot
for i, (city, coords) in enumerate(cities_coords.items()):
    ax = axes[i]  # Get subplot grid position
    city_data = city_datasets[city]

    if len(city_data) > 0:  # Ensure there's data to plot
        # Scatterplot of review scores (higher is better, so Viridis is not inverted)
        sns.scatterplot(
            x=city_data['c_geolocation_lng'],
            y=city_data['c_geolocation_lat'],
            hue=city_data['review_score'],  # Use review scores for color
            palette='viridis',  # Viridis colormap (not inverted)
            size=city_data['total_payment_value'],  # Size points by payment value
            sizes=(20, 200),  # Adjust sizes to improve visibility
            ax=ax,
            legend=False  # Disable legend for individual plots
        )

        ax.set_title(f"Customer Review Scores in {city}")
        ax.set_xlabel('Longitude')
        ax.set_ylabel('Latitude')

        # Remove the top, left, and right spines
        for spine in ('top', 'left', 'right'):
            ax.spines[spine].set_visible(False)

# Shared horizontal colorbar driven by the global review-score range
sm = plt.cm.ScalarMappable(cmap='viridis', norm=plt.Normalize(global_min, global_max))
sm.set_array([])  # Only needed for older versions of matplotlib
cbar = plt.colorbar(sm, ax=axes, orientation='horizontal', fraction=0.02, pad=0.1)
cbar.set_label('Review Score')

# Remove any unused subplots
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])

# Adjust layout and add a main title
plt.subplots_adjust(top=0.92, hspace=0.4)  # Adjust space between subplots
plt.suptitle('Customer Reviews in Major Brazilian Cities, Dot Size by Total Payment Value', fontsize=16, y=0.98)
plt.show()
No description has been provided for this image

Time To Estimate Delivery Days¶

In [ ]:
# Analyzing "Time to Estimate Delivery Days" based on customer location is important because:
#
# 1. Accuracy in Delivery Predictions: Understanding how well delivery estimates align with actual delivery
#    times across different regions allows businesses to improve their prediction models. This ensures customers
#    have more accurate expectations for their orders.
#
# 2. Regional Variations: Certain cities or regions may consistently show longer or shorter time estimates due to
#    local conditions, infrastructure, or logistical challenges. Identifying these patterns helps adjust
#    estimations to be region-specific.
#
# 3. Customer Satisfaction: Providing more accurate delivery time estimates leads to higher customer satisfaction.
#    Misalignments between estimated and actual delivery times can result in dissatisfaction, complaints, or negative reviews.
In [ ]:
# @title Time To Estimate Delivery Days in Major Brazilian Cities (Dot Size by Total Payment Value)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from geopy.distance import geodesic
import numpy as np

# Define cities coordinates (latitude, longitude)
cities_coords = {
    'Rio de Janeiro': (-22.9068, -43.1729),
    'Sao Paulo': (-23.5505, -46.6333),
    'Porto Alegre': (-30.0346, -51.2177),
    'Curitiba': (-25.4284, -49.2733),
    'Fortaleza': (-3.7172, -38.5434),
    'Bahia': (-12.9714, -38.5014),
    'Brasilia': (-15.7942, -47.8822)
}

def filter_city_data(city_coords, radius_km=50):
    """Return a copy of orders_cleaned restricted to customers within radius_km of city_coords.

    Uses geopy's geodesic distance, evaluated row by row — this is the slowest
    part of the cell, so call it once per city and cache the result.
    """
    lat, lng = city_coords
    city_data = orders_cleaned[
        orders_cleaned.apply(
            lambda row: geodesic((row['c_geolocation_lat'], row['c_geolocation_lng']), (lat, lng)).km <= radius_km,
            axis=1
        )
    ].copy()  # Use .copy() to avoid SettingWithCopyWarning
    return city_data

# FIX: filter each city ONCE and reuse the result. The previous version ran the
# expensive per-row geodesic filter twice per city (once for the global color
# scale, once again when plotting), doubling the runtime of this cell.
city_datasets = {city: filter_city_data(coords) for city, coords in cities_coords.items()}

# Global min and max time-to-estimate values across all cities for consistent color mapping
all_time_to_estimate = pd.concat(
    [data['time_to_estimate_delivery_days'].dropna() for data in city_datasets.values()]
)
global_min = all_time_to_estimate.min()
global_max = all_time_to_estimate.max()

# Create a 3x3 grid of subplots (sufficient for 7 cities)
fig, axes = plt.subplots(3, 3, figsize=(20, 12))
axes = axes.flatten()

# Loop through cities and plot
for i, (city, coords) in enumerate(cities_coords.items()):
    ax = axes[i]  # Get subplot grid position
    city_data = city_datasets[city]

    if len(city_data) > 0:  # Ensure there's data to plot
        # Scatterplot of time-to-estimate; inverted Viridis so lower (better) times are brighter
        sns.scatterplot(
            x=city_data['c_geolocation_lng'],
            y=city_data['c_geolocation_lat'],
            hue=city_data['time_to_estimate_delivery_days'],  # Use delivery time for color
            palette='viridis_r',  # Inverted Viridis colormap (lower times are better)
            size=city_data['total_payment_value'],  # Size points by payment value
            sizes=(20, 200),  # Adjust sizes to improve visibility
            ax=ax,
            legend=False  # Disable legend for individual plots
        )

        ax.set_title(f"Time To Estimate Delivery Days in {city}")
        ax.set_xlabel('Longitude')
        ax.set_ylabel('Latitude')

        # Remove the top, left, and right spines
        for spine in ('top', 'left', 'right'):
            ax.spines[spine].set_visible(False)

# Shared horizontal colorbar driven by the global time-to-estimate range
sm = plt.cm.ScalarMappable(cmap='viridis_r', norm=plt.Normalize(global_min, global_max))
sm.set_array([])  # Only needed for older versions of matplotlib
cbar = plt.colorbar(sm, ax=axes, orientation='horizontal', fraction=0.02, pad=0.1)
cbar.set_label('Time To Estimate Delivery Days')

# Remove any unused subplots
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])

# Adjust layout and add a main title
plt.subplots_adjust(top=0.92, hspace=0.4)  # Adjust space between subplots
plt.suptitle('Time To Estimate Delivery Days in Major Brazilian Cities, Dot Size by Total Payment Value', fontsize=16, y=0.98)
plt.show()
No description has been provided for this image

Sellers Geolocation¶

Sellers Geolocation¶

In [ ]:
# Plotting delivery time days based on seller location is essential for understanding the geographical
# impact of delivery performance:
#
# 1. Seller-Based Logistics Insights: By visualizing delivery times relative to the seller's location,
#    companies can assess the efficiency of their sellers. This enables identification of specific sellers
#    or regions with longer delivery times, allowing for targeted improvements in supply chain or logistics
#    management.
#
# 2. Operational Bottlenecks: Regions where sellers consistently have higher delivery times may indicate
#    logistical bottlenecks, such as poor infrastructure or longer routes. Addressing these issues can reduce
#    delivery delays and improve customer satisfaction.
#
# 3. Seller Performance Monitoring: Sellers with higher volumes of orders may have different delivery
#    time patterns compared to those handling fewer orders. Monitoring the relationship between delivery
#    times and the number of orders allows businesses to support high-volume sellers with better logistical
#    planning and resources.

Delivery time days¶

In [ ]:
# @title Delivery Performance in Major Brazilian Cities (Dot Size by Seller Number of Orders)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from geopy.distance import geodesic
import numpy as np

# NOTE(review): this cell reads the global `sellers_grouped`, which is built in a
# LATER cell ("Time to Estimate Delivery Days") — the notebook only works when run
# out of order. Move the aggregation cell above this one so Restart & Run All succeeds.

# City centres as (latitude, longitude)
cities_coords = {
    'Rio de Janeiro': (-22.9068, -43.1729),
    'Sao Paulo': (-23.5505, -46.6333),
    'Porto Alegre': (-30.0346, -51.2177),
    'Curitiba': (-25.4284, -49.2733),
    # 'Fortaleza': (-3.7172, -38.5434),  # Commented out for layout space
    'Bahia': (-12.9714, -38.5014),
    'Brasilia': (-15.7942, -47.8822)
}

def filter_city_data(city_coords, radius_km=50):
    """Return the rows of `sellers_grouped` located within `radius_km` of `city_coords`.

    Distance is geopy's geodesic (ellipsoidal) distance — more accurate than the
    plain haversine formula the original comment claimed.

    Parameters
    ----------
    city_coords : tuple[float, float]
        (latitude, longitude) of the city centre.
    radius_km : float, default 50
        Inclusion radius in kilometres.
    """
    lat, lng = city_coords
    within_radius = sellers_grouped.apply(
        lambda row: geodesic((row['s_geolocation_lat'], row['s_geolocation_lng']), (lat, lng)).km <= radius_km,
        axis=1
    )
    return sellers_grouped[within_radius].copy()  # .copy() avoids SettingWithCopyWarning

# 3x3 grid is enough for the 6 active cities (previous comment said 7, but
# Fortaleza is commented out above)
fig, axes = plt.subplots(3, 3, figsize=(20, 12))
axes = axes.flatten()  # Flatten the grid for simple 1-D indexing

# One scatter panel per city: colour = mean delivery time, dot size = order volume
for i, (city, coords) in enumerate(cities_coords.items()):
    ax = axes[i]
    city_data = filter_city_data(coords)

    # (Removed a dead `delivery_delay` computation here — the column was
    # assigned but never used by the plot.)
    sns.scatterplot(
        x=city_data['s_geolocation_lng'],
        y=city_data['s_geolocation_lat'],
        hue=city_data['delivery_time_days'],  # colour by mean delivery time
        palette='viridis_r',  # inverted Viridis: lower (better) times stand out
        size=city_data['num_orders'],  # dot size = number of orders filled
        sizes=(20, 200),  # min/max marker sizes for visibility
        ax=ax,
        legend=False  # a single shared colorbar is added below instead
    )

    ax.set_title(f"Delivery Performance in {city}")
    ax.set_xlabel('Longitude')
    ax.set_ylabel('Latitude')

    # Declutter: keep only the bottom spine
    ax.spines['top'].set_visible(False)
    ax.spines['left'].set_visible(False)
    ax.spines['right'].set_visible(False)

# Shared horizontal colorbar, normalised over ALL sellers so panels are comparable
norm = plt.Normalize(vmin=sellers_grouped['delivery_time_days'].min(), vmax=sellers_grouped['delivery_time_days'].max())
sm = plt.cm.ScalarMappable(cmap='viridis_r', norm=norm)
sm.set_array([])  # Only needed for older versions of matplotlib
cbar = plt.colorbar(sm, ax=axes, orientation='horizontal', fraction=0.02, pad=0.1)
cbar.set_label('Delivery Time Days')

# Remove the unused trailing subplots; counting from the dict length (instead of
# the leaked loop variable `i`) stays correct even if the loop above changes
for j in range(len(cities_coords), len(axes)):
    fig.delaxes(axes[j])

# Adjust layout and add a main title
plt.tight_layout()
plt.subplots_adjust(top=0.9)  # leave headroom for the suptitle
plt.suptitle('Delivery Performance in Major Brazilian Cities, Dot Size by Seller Number of Orders', fontsize=16, y=1.02)
plt.show()
No description has been provided for this image

Distance Efficiency¶

In [ ]:
# Plotting Distance Efficiency based on seller locations provides key insights into how well
# sellers are optimizing their deliveries over various distances:
#
# 1. Logistical Optimization: This visualization helps identify which sellers or regions are most
#    efficient in managing the distance between their location and customers. Sellers with high
#    distance efficiency are likely optimizing their logistics and transportation.
#
# 2. Regional Efficiency Patterns: By comparing cities, we can determine if certain cities consistently
#    have more efficient sellers, possibly due to better infrastructure, transport networks, or delivery hubs.
#
# 3. High Volume vs. Efficiency Trade-off: By correlating the number of orders with distance efficiency,
#    we can assess if high-volume sellers are sacrificing efficiency for order fulfillment, or if they manage
#    to maintain both high order volumes and efficient delivery practices.
In [ ]:
# @title Distance Efficiency in Major Brazilian Cities (Dot Size by Seller Number of Orders)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from geopy.distance import geodesic
import numpy as np

# NOTE(review): this cell reads the global `sellers_grouped`, which is created in a
# LATER cell — confirm execution order before Restart & Run All. The city dict and
# `filter_city_data` are duplicated across several cells; consider one shared helper.

# City centres as (latitude, longitude)
cities_coords = {
    'Rio de Janeiro': (-22.9068, -43.1729),
    'Sao Paulo': (-23.5505, -46.6333),
    'Porto Alegre': (-30.0346, -51.2177),
    'Curitiba': (-25.4284, -49.2733),
    # 'Fortaleza': (-3.7172, -38.5434),  # Commented out for layout space
    'Bahia': (-12.9714, -38.5014),
    'Brasilia': (-15.7942, -47.8822)
}

# Geodesic-distance filter (geopy's geodesic, not haversine as previously stated):
# keep sellers within `radius_km` of the given city centre
def filter_city_data(city_coords, radius_km=50):
    """Return the rows of `sellers_grouped` within `radius_km` km of `city_coords`."""
    lat, lng = city_coords
    city_data = sellers_grouped[
        sellers_grouped.apply(
            lambda row: geodesic((row['s_geolocation_lat'], row['s_geolocation_lng']), (lat, lng)).km <= radius_km,
            axis=1
        )
    ].copy()  # Use .copy() to avoid SettingWithCopyWarning
    return city_data

# 3x3 grid covers the 6 active cities (Fortaleza is commented out above)
fig, axes = plt.subplots(3, 3, figsize=(20, 12))
axes = axes.flatten()  # Flatten the axes array for easier indexing

# One panel per city: colour = mean distance efficiency, dot size = order volume
for i, (city, coords) in enumerate(cities_coords.items()):
    ax = axes[i]  # Get subplot grid position
    city_data = filter_city_data(coords)

    # Scatterplot coloured by distance efficiency (non-inverted Viridis:
    # higher efficiency is assumed to be better)
    scatter = sns.scatterplot(
        x=city_data['s_geolocation_lng'],
        y=city_data['s_geolocation_lat'],
        hue=city_data['distance_efficiency'],  # Use distance efficiency for color
        palette='viridis',  # Viridis colormap (not inverted, assuming higher efficiency is better)
        size=city_data['num_orders'],  # Size of points based on number of orders
        sizes=(20, 200),  # Adjust sizes to improve visibility
        ax=ax,
        legend=False  # a single shared colorbar is added below instead
    )

    ax.set_title(f"Distance Efficiency in {city}")
    ax.set_xlabel('Longitude')
    ax.set_ylabel('Latitude')

    # Declutter: keep only the bottom spine
    ax.spines['top'].set_visible(False)
    ax.spines['left'].set_visible(False)
    ax.spines['right'].set_visible(False)

# Shared colorbar normalised over ALL sellers so panels are directly comparable
norm = plt.Normalize(vmin=sellers_grouped['distance_efficiency'].min(), vmax=sellers_grouped['distance_efficiency'].max())
sm = plt.cm.ScalarMappable(cmap='viridis', norm=norm)  # Use Viridis colormap
sm.set_array([])  # Only needed for older versions of matplotlib
cbar = plt.colorbar(sm, ax=axes, orientation='horizontal', fraction=0.02, pad=0.1)
cbar.set_label('Distance Efficiency')

# Remove any unused subplots (relies on loop variable `i` leaking from the loop)
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])

# Adjust layout and add a main title
plt.tight_layout()
plt.subplots_adjust(top=0.9)  # Adjust top to give space for the main title
plt.suptitle('Distance Efficiency in Major Brazilian Cities, Dot Size by Seller Number of Orders', fontsize=16, y=1.02)
plt.show()
No description has been provided for this image

Freight share¶

In [ ]:
# Who pays more for transportation?
# We might find a freight ratio by dividing the freight value by the order value.
# This ratio indicates the percentage of the product price that a person had to pay just to get their order delivered.
# For example, if a product costs R$50.00 and the freight value was R$10.00, then the freight ratio is 0.2 or 20%.
# Higher freight ratios are very likely to discourage customers to complete a purchase.
# Due to logistics costs, we expect to see lower freight ratios in densely populated areas and higher freight ratios in sparsely populated regions.
In [ ]:
# Plotting Freight Share based on seller locations is essential for understanding the cost dynamics
# of logistics in various regions:
#
# 1. Cost Management: This visualization reveals how much of the total order cost is attributed to
#    freight expenses for different sellers. It allows businesses to identify areas where costs can be
#    optimized, leading to improved profitability.
#
# 2. Performance Benchmarking: By comparing freight shares across cities and sellers, organizations can
#    benchmark performance. High freight share in certain locations might indicate inefficiencies or higher
#    logistical challenges that need to be addressed.
#
# 3. Strategic Decision-Making: Understanding freight share helps in making informed decisions about
#    supplier selection, pricing strategies, and resource allocation. It can guide companies on where to
#    invest in infrastructure improvements or adjust pricing models to better compete.
#
# 4. Customer Experience: High freight costs can impact overall pricing and delivery times. By analyzing
#    freight share, businesses can assess how logistics affect customer satisfaction and make necessary
#    adjustments to meet customer expectations effectively.
In [ ]:
# @title Freight Share in Major Brazilian Cities (Dot Size by Seller Number of Orders)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from geopy.distance import geodesic
import numpy as np

# NOTE(review): this cell reads the global `sellers_grouped`, which is created in a
# LATER cell — confirm execution order before Restart & Run All. The city dict and
# `filter_city_data` are duplicated across several cells; consider one shared helper.

# City centres as (latitude, longitude)
cities_coords = {
    'Rio de Janeiro': (-22.9068, -43.1729),
    'Sao Paulo': (-23.5505, -46.6333),
    'Porto Alegre': (-30.0346, -51.2177),
    'Curitiba': (-25.4284, -49.2733),
    # 'Fortaleza': (-3.7172, -38.5434),  # Commented out for layout space
    'Bahia': (-12.9714, -38.5014),
    'Brasilia': (-15.7942, -47.8822)
}

# Geodesic-distance filter (geopy's geodesic, not haversine as previously stated):
# keep sellers within `radius_km` of the given city centre
def filter_city_data(city_coords, radius_km=50):
    """Return the rows of `sellers_grouped` within `radius_km` km of `city_coords`."""
    lat, lng = city_coords
    city_data = sellers_grouped[
        sellers_grouped.apply(
            lambda row: geodesic((row['s_geolocation_lat'], row['s_geolocation_lng']), (lat, lng)).km <= radius_km,
            axis=1
        )
    ].copy()  # Use .copy() to avoid SettingWithCopyWarning
    return city_data

# 3x3 grid covers the 6 active cities (Fortaleza is commented out above)
fig, axes = plt.subplots(3, 3, figsize=(20, 12))
axes = axes.flatten()  # Flatten the axes array for easier indexing

# One panel per city: colour = mean freight share, dot size = order volume
for i, (city, coords) in enumerate(cities_coords.items()):
    ax = axes[i]  # Get subplot grid position
    city_data = filter_city_data(coords)

    # Scatterplot coloured by freight share (inverted Viridis: a higher
    # freight share is less favourable for the customer)
    scatter = sns.scatterplot(
        x=city_data['s_geolocation_lng'],
        y=city_data['s_geolocation_lat'],
        hue=city_data['freight_share'],  # Use freight share for color
        palette='viridis_r',  # Inverted Viridis (higher freight share is less favorable)
        size=city_data['num_orders'],  # Size of points based on number of orders
        sizes=(20, 200),  # Adjust sizes to improve visibility
        ax=ax,
        legend=False  # a single shared colorbar is added below instead
    )

    ax.set_title(f"Freight Share in {city}")
    ax.set_xlabel('Longitude')
    ax.set_ylabel('Latitude')

    # Declutter: keep only the bottom spine
    ax.spines['top'].set_visible(False)
    ax.spines['left'].set_visible(False)
    ax.spines['right'].set_visible(False)

# Shared colorbar normalised over ALL sellers so panels are directly comparable
norm = plt.Normalize(vmin=sellers_grouped['freight_share'].min(), vmax=sellers_grouped['freight_share'].max())
sm = plt.cm.ScalarMappable(cmap='viridis_r', norm=norm)  # Use inverted Viridis colormap
sm.set_array([])  # Only needed for older versions of matplotlib
cbar = plt.colorbar(sm, ax=axes, orientation='horizontal', fraction=0.02, pad=0.1)
cbar.set_label('Freight Share')

# Remove any unused subplots (relies on loop variable `i` leaking from the loop)
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])

# Adjust layout and add a main title
plt.tight_layout()
plt.subplots_adjust(top=0.9)  # Adjust top to give space for the main title
plt.suptitle('Freight Share in Major Brazilian Cities, Dot Size by Seller Number of Orders', fontsize=16, y=1.02)
plt.show()
No description has been provided for this image

Time To Estimate Delivery Days¶

In [ ]:
###
# Plotting the "time to estimate delivery days" based on seller location and order size is important for several reasons:
#
# 1. Identifying Regional Delivery Performance: By visualizing delivery times geographically, businesses can easily identify areas
#    where sellers may face longer delivery delays, enabling targeted interventions to improve logistics and customer service in those
#    regions.
#
# 2. Assessing Impact of Order Volume: The size of the points on the plot indicates the number of orders each seller has filled.
#    This allows businesses to analyze whether high-order-volume sellers are able to maintain timely deliveries, highlighting potential
#    capacity issues that could impact customer satisfaction.
#
# 3. Strategic Resource Allocation: Understanding the relationship between seller location, delivery times, and order size enables
#    businesses to make informed decisions on resource allocation, such as optimizing logistics support or enhancing seller performance
#    in specific areas, ultimately driving operational efficiency and customer satisfaction.
###
In [ ]:
# @title Time to Estimate Delivery Days in Major Brazilian Cities (Dot Size by Seller Number of Orders)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from geopy.distance import geodesic
import numpy as np

# NOTE(review): this cell BUILDS `sellers_grouped`, yet several EARLIER cells
# already read it — the notebook depends on out-of-order execution. Move this
# aggregation above the first cell that uses it.

# Per-seller metrics to average (plus seller coordinates)
cols = [
    'delivery_time_days', 'order_processing_time_days', 'carrier_delivery_time_days',
    'customer_delivery_time_days', 'time_to_estimate_delivery_days',
    'distance_km', 'distance_efficiency', 'freight_cost_per_km', 'price_to_distance',
    'cost_efficiency', 'order_size_per_km', 'total_order_cost_per_km', 'freight_share',
    's_geolocation_lat', 's_geolocation_lng'
]

# Group orders by seller and calculate the mean of specified columns
sellers_grouped = orders_cleaned.groupby('seller_id')[cols].mean().reset_index()

# Count the number of orders each seller has filled and attach it as `num_orders`
order_count = orders_cleaned.groupby('seller_id').size().reset_index(name='num_orders')
sellers_grouped = sellers_grouped.merge(order_count, on='seller_id')

# City centres as (latitude, longitude)
cities_coords = {
    'Rio de Janeiro': (-22.9068, -43.1729),
    'Sao Paulo': (-23.5505, -46.6333),
    'Porto Alegre': (-30.0346, -51.2177),
    'Curitiba': (-25.4284, -49.2733),
    # 'Fortaleza': (-3.7172, -38.5434),  # Commented out for layout space
    'Bahia': (-12.9714, -38.5014),
    'Brasilia': (-15.7942, -47.8822)
}

# Geodesic-distance filter (geopy's geodesic, not haversine as previously stated):
# keep sellers within `radius_km` of the given city centre
def filter_city_data(city_coords, radius_km=50):
    """Return the rows of `sellers_grouped` within `radius_km` km of `city_coords`."""
    lat, lng = city_coords
    city_data = sellers_grouped[
        sellers_grouped.apply(
            lambda row: geodesic((row['s_geolocation_lat'], row['s_geolocation_lng']), (lat, lng)).km <= radius_km,
            axis=1
        )
    ].copy()  # Use .copy() to avoid SettingWithCopyWarning
    return city_data

# 3x3 grid covers the 6 active cities (Fortaleza is commented out above)
fig, axes = plt.subplots(3, 3, figsize=(20, 12))
axes = axes.flatten()  # Flatten the axes array for easier indexing

# One panel per city: colour = mean estimated-delivery window, dot size = order volume
for i, (city, coords) in enumerate(cities_coords.items()):
    ax = axes[i]  # Get subplot grid position
    city_data = filter_city_data(coords)

    # Scatterplot coloured by time-to-estimated-delivery (inverted Viridis:
    # shorter promised windows are better)
    scatter = sns.scatterplot(
        x=city_data['s_geolocation_lng'],
        y=city_data['s_geolocation_lat'],
        hue=city_data['time_to_estimate_delivery_days'],  # Use time to estimate delivery days for color
        palette='viridis_r',  # Inverted Viridis (lower times are better)
        size=city_data['num_orders'],  # Size of points based on number of orders
        sizes=(20, 200),  # Adjust sizes to improve visibility
        ax=ax,
        legend=False  # a single shared colorbar is added below instead
    )

    ax.set_title(f"Time to Estimate Delivery Days in {city}")
    ax.set_xlabel('Longitude')
    ax.set_ylabel('Latitude')

    # Declutter: keep only the bottom spine
    ax.spines['top'].set_visible(False)
    ax.spines['left'].set_visible(False)
    ax.spines['right'].set_visible(False)

# Shared colorbar normalised over ALL sellers so panels are directly comparable
norm = plt.Normalize(vmin=sellers_grouped['time_to_estimate_delivery_days'].min(), vmax=sellers_grouped['time_to_estimate_delivery_days'].max())
sm = plt.cm.ScalarMappable(cmap='viridis_r', norm=norm)
sm.set_array([])  # Only needed for older versions of matplotlib
cbar = plt.colorbar(sm, ax=axes, orientation='horizontal', fraction=0.02, pad=0.1)
cbar.set_label('Time to Estimate Delivery Days')

# Remove any unused subplots (relies on loop variable `i` leaking from the loop)
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])

# Adjust layout and add a main title
plt.tight_layout()
plt.subplots_adjust(top=0.9)  # Adjust top to give space for the main title
plt.suptitle('Time to Estimate Delivery Days in Major Brazilian Cities, Dot Size by Seller Number of Orders', fontsize=16, y=1.02)
plt.show()
No description has been provided for this image

Total order cost per km¶

In [ ]:
###
# Plotting the total order cost per km based on seller location and order size is important for several reasons:
#
# 1. Identifying High-Cost Areas: By visualizing total order costs geographically, businesses can pinpoint regions where
#    shipping costs are disproportionately high. This information is critical for optimizing delivery routes and reducing
#    transportation expenses in those specific areas.
#
# 2. Assessing Seller Performance: The size of the points in the plot indicates the number of orders each seller has filled.
#    Analyzing how total order cost per km correlates with order volume allows businesses to evaluate whether high-volume
#    sellers are maintaining cost efficiency. This can inform decisions about supporting sellers that may be struggling with
#    high logistics costs.
#
# 3. Strategic Pricing Adjustments: Understanding the geographic distribution of order costs helps businesses make
#    targeted pricing decisions. For instance, sellers operating in high-cost areas may need to adjust their pricing models to
#    remain competitive, while also considering promotional strategies to attract customers in those regions.
###
In [ ]:
# @title Total Order Cost per km in Major Brazilian Cities (Dot Size by Seller Number of Orders)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from geopy.distance import geodesic
import numpy as np

# NOTE(review): this cell rebuilds `sellers_grouped` with code copied verbatim
# from the previous cell; the aggregation (and `filter_city_data` and the city
# dict) should live in one shared cell or helper module instead.

# Per-seller metrics to average (plus seller coordinates)
cols = [
    'delivery_time_days', 'order_processing_time_days', 'carrier_delivery_time_days',
    'customer_delivery_time_days', 'time_to_estimate_delivery_days',
    'distance_km', 'distance_efficiency', 'freight_cost_per_km', 'price_to_distance',
    'cost_efficiency', 'order_size_per_km', 'total_order_cost_per_km', 'freight_share',
    's_geolocation_lat', 's_geolocation_lng'
]

# Group orders by seller and calculate the mean of specified columns
sellers_grouped = orders_cleaned.groupby('seller_id')[cols].mean().reset_index()

# Count the number of orders each seller has filled and attach it as `num_orders`
order_count = orders_cleaned.groupby('seller_id').size().reset_index(name='num_orders')
sellers_grouped = sellers_grouped.merge(order_count, on='seller_id')

# City centres as (latitude, longitude)
cities_coords = {
    'Rio de Janeiro': (-22.9068, -43.1729),
    'Sao Paulo': (-23.5505, -46.6333),
    'Porto Alegre': (-30.0346, -51.2177),
    'Curitiba': (-25.4284, -49.2733),
    # 'Fortaleza': (-3.7172, -38.5434),  # Commented out for layout space
    'Bahia': (-12.9714, -38.5014),
    'Brasilia': (-15.7942, -47.8822)
}

# Geodesic-distance filter (geopy's geodesic, not haversine as previously stated):
# keep sellers within `radius_km` of the given city centre
def filter_city_data(city_coords, radius_km=50):
    """Return the rows of `sellers_grouped` within `radius_km` km of `city_coords`."""
    lat, lng = city_coords
    city_data = sellers_grouped[
        sellers_grouped.apply(
            lambda row: geodesic((row['s_geolocation_lat'], row['s_geolocation_lng']), (lat, lng)).km <= radius_km,
            axis=1
        )
    ].copy()  # Use .copy() to avoid SettingWithCopyWarning
    return city_data

# 3x3 grid covers the 6 active cities (Fortaleza is commented out above)
fig, axes = plt.subplots(3, 3, figsize=(20, 12))
axes = axes.flatten()  # Flatten the axes array for easier indexing

# One panel per city: colour = mean total cost per km, dot size = order volume
for i, (city, coords) in enumerate(cities_coords.items()):
    ax = axes[i]  # Get subplot grid position
    city_data = filter_city_data(coords)

    # Scatterplot coloured by total order cost per km (inverted Viridis:
    # lower cost per km is assumed to be better)
    scatter = sns.scatterplot(
        x=city_data['s_geolocation_lng'],
        y=city_data['s_geolocation_lat'],
        hue=city_data['total_order_cost_per_km'],  # Use total order cost per km for color
        palette='viridis_r',  # Inverted Viridis if lower cost per km is better
        size=city_data['num_orders'],  # Size of points based on number of orders
        sizes=(20, 200),  # Adjust sizes to improve visibility
        ax=ax,
        legend=False  # a single shared colorbar is added below instead
    )

    ax.set_title(f"Total Order Cost per km in {city}")
    ax.set_xlabel('Longitude')
    ax.set_ylabel('Latitude')

    # Declutter: keep only the bottom spine
    ax.spines['top'].set_visible(False)
    ax.spines['left'].set_visible(False)
    ax.spines['right'].set_visible(False)

# Shared colorbar normalised over ALL sellers so panels are directly comparable
norm = plt.Normalize(vmin=sellers_grouped['total_order_cost_per_km'].min(), vmax=sellers_grouped['total_order_cost_per_km'].max())
sm = plt.cm.ScalarMappable(cmap='viridis_r', norm=norm)
sm.set_array([])  # Only needed for older versions of matplotlib
cbar = plt.colorbar(sm, ax=axes, orientation='horizontal', fraction=0.02, pad=0.1)
cbar.set_label('Total Order Cost per km')

# Remove any unused subplots (relies on loop variable `i` leaking from the loop)
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])

# Adjust layout and add a main title
plt.tight_layout()
plt.subplots_adjust(top=0.9)  # Adjust top to give space for the main title
plt.suptitle('Total Order Cost per km in Major Brazilian Cities, Dot Size by Seller Number of Orders', fontsize=16, y=1.02)
plt.show()
No description has been provided for this image

Largest sellers¶

In [ ]:
# Metrics we want to analyse the largest sellers by:
# 1) delivery time days
# 2) late deliveries
# 3) freight cost
# 4) product weight
# 5) total payment value
# 6) cost efficiency
# 7) customer reviews

1) Delivery time (in days)¶

In [ ]:
# @title Delivery Performance of 10 Largest Sellers (Dot Size by Customer Payment Value)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.colors import Normalize
from matplotlib import colormaps

# NOTE(review): reads the global `orders_cleaned`, which must already exist in
# the kernel — confirm it is defined in an earlier cell.

# Rank sellers by number of order rows and keep the top 10
largest_seller_ids = orders_cleaned['seller_id'].value_counts().nlargest(10).index.tolist()

# 3x4 grid: 10 panels used, the 2 unused ones are removed below
fig, axes = plt.subplots(3, 4, figsize=(24, 12))
axes = axes.flatten()  # Flatten the grid for simple 1-D indexing

# Global colour scale so all panels share one comparable mapping
global_vmin = orders_cleaned['delivery_time_days'].min()
global_vmax = orders_cleaned['delivery_time_days'].max()

# Loop invariants hoisted out of the loop (they were rebuilt on every iteration)
cmap = colormaps['viridis_r']  # inverted Viridis: shorter delivery times stand out
norm = Normalize(vmin=global_vmin, vmax=global_vmax)
titles = [
    "Largest Seller", "2nd Largest Seller", "3rd Largest Seller",
    "4th Largest Seller", "5th Largest Seller", "6th Largest Seller",
    "7th Largest Seller", "8th Largest Seller", "9th Largest Seller", "10th Largest Seller"
]

# One panel per seller: customer locations, colour = delivery time, size = payment value
for i, seller_id in enumerate(largest_seller_ids):
    # Filter data for the current seller
    seller_data = orders_cleaned[orders_cleaned['seller_id'] == seller_id].copy()

    if not seller_data.empty:
        sns.scatterplot(
            x=seller_data['c_geolocation_lng'],
            y=seller_data['c_geolocation_lat'],
            hue=seller_data['delivery_time_days'],  # colour by delivery time
            palette=cmap,
            size=seller_data['total_payment_value'],  # dot size = payment value
            sizes=(20, 200),  # min/max marker sizes for visibility
            ax=axes[i],
            legend=False,  # a single shared colorbar is added below instead
            hue_norm=norm  # shared normalisation keeps panels comparable
        )

        # Mark the seller's (mean) location with a black reference dot
        axes[i].scatter(
            seller_data['s_geolocation_lng'].mean(),
            seller_data['s_geolocation_lat'].mean(),
            color='black',
            s=150,
            label='Seller Location'
        )

        axes[i].set_title(titles[i], fontsize=14, pad=20)
        axes[i].set_xlabel('Longitude', labelpad=20)
        axes[i].set_ylabel('Latitude', labelpad=20)

        # Declutter: keep only the bottom spine
        axes[i].spines['top'].set_visible(False)
        axes[i].spines['left'].set_visible(False)
        axes[i].spines['right'].set_visible(False)
    else:
        # No orders for this seller — hide the empty panel
        axes[i].set_visible(False)

# Shared horizontal colorbar built from the same cmap/norm as the panels
cbar = plt.colorbar(
    plt.cm.ScalarMappable(norm=norm, cmap=cmap),
    ax=axes,
    orientation='horizontal',
    fraction=0.05,
    pad=0.12
)
cbar.set_label('Delivery Time (days)', fontsize=12)
cbar.ax.tick_params(labelsize=10)

# Delete the unused trailing subplots (grid has 12 slots, 10 sellers)
for j in range(len(largest_seller_ids), len(axes)):
    fig.delaxes(axes[j])

# Adjust layout and add a main title
plt.subplots_adjust(
    top=0.85,
    bottom=0.18,
    hspace=0.6,
    wspace=0.35
)
plt.suptitle('Delivery Performance of 10 Largest Sellers, Dot Size by Customer Payment Value', fontsize=16)

# Show the plot
plt.show()
No description has been provided for this image

2) Time to estimate delivery (days)¶

In [ ]:
# @title Time to Estimate Delivery Days of 10 Largest Sellers (Dot Size by Customer Payment Value)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.colors import Normalize
from matplotlib import colormaps

# NOTE(review): reads the global `orders_cleaned`, which must already exist in
# the kernel — confirm it is defined in an earlier cell. Structure is copied
# from the previous "10 Largest Sellers" cell; consider a shared plotting helper.

# Rank sellers by number of order rows and keep the top 10
largest_seller_ids = orders_cleaned['seller_id'].value_counts().nlargest(10).index.tolist()

# 3x4 grid: 10 panels used, the 2 unused ones are removed below
fig, axes = plt.subplots(3, 4, figsize=(24, 12))  # Adjusted to 3 rows and 4 columns, larger figure size
axes = axes.flatten()  # Flatten the axes array for easier indexing

# Global colour scale so all panels share one comparable mapping
global_vmin = orders_cleaned['time_to_estimate_delivery_days'].min()
global_vmax = orders_cleaned['time_to_estimate_delivery_days'].max()

# One panel per seller: customer locations, colour = time to estimated delivery,
# size = payment value
for i, seller_id in enumerate(largest_seller_ids):
    # Filter data for the current seller
    seller_data = orders_cleaned[orders_cleaned['seller_id'] == seller_id].copy()

    # Check if there's data for the seller
    if not seller_data.empty:
        # Inverted Viridis: shorter promised windows are better
        # (rebuilt per iteration — could be hoisted above the loop)
        cmap = colormaps['viridis_r']

        # Normalize based on global min and max time to estimate delivery (not per seller)
        norm = Normalize(vmin=global_vmin, vmax=global_vmax)

        # Create a scatterplot of customer locations based on time to estimate delivery
        scatter = sns.scatterplot(
            x=seller_data['c_geolocation_lng'],
            y=seller_data['c_geolocation_lat'],
            hue=seller_data['time_to_estimate_delivery_days'],  # Use time to estimate delivery as color metric
            palette=cmap,  # Custom colormap
            size=seller_data['total_payment_value'],  # Size based on total_payment_value
            sizes=(20, 200),  # Adjust min/max size range of customer dots
            ax=axes[i],  # Use the corresponding subplot
            legend=False,  # a single shared colorbar is added below instead
            hue_norm=norm  # Normalize hue for consistent mapping
        )

        # Mark the seller's (mean) location with a black reference dot
        axes[i].scatter(
            seller_data['s_geolocation_lng'].mean(),
            seller_data['s_geolocation_lat'].mean(),
            color='black',
            s=150,  # Size for seller's location
            label='Seller Location'
        )

        # Ordinal subtitle matching the seller's rank
        titles = [
            "Largest Seller", "2nd Largest Seller", "3rd Largest Seller",
            "4th Largest Seller", "5th Largest Seller", "6th Largest Seller",
            "7th Largest Seller", "8th Largest Seller", "9th Largest Seller", "10th Largest Seller"
        ]
        axes[i].set_title(titles[i], fontsize=14, pad=20)

        # Set labels for the axes and adjust padding
        axes[i].set_xlabel('Longitude', labelpad=20)
        axes[i].set_ylabel('Latitude', labelpad=20)

        # Declutter: keep only the bottom spine
        axes[i].spines['top'].set_visible(False)
        axes[i].spines['left'].set_visible(False)
        axes[i].spines['right'].set_visible(False)
    else:
        # No orders for this seller — hide the empty panel
        axes[i].set_visible(False)

# Shared horizontal colorbar using the same cmap and global normalisation
cmap = colormaps['viridis_r']
norm = Normalize(vmin=global_vmin, vmax=global_vmax)  # Global normalization

cbar = plt.colorbar(
    plt.cm.ScalarMappable(norm=norm, cmap=cmap),
    ax=axes,
    orientation='horizontal',
    fraction=0.05,
    pad=0.12
)
cbar.set_label('Time to Estimate Delivery Days', fontsize=12)
cbar.ax.tick_params(labelsize=10)

# Delete the unused trailing subplots (grid has 12 slots, 10 sellers)
for j in range(len(largest_seller_ids), len(axes)):
    fig.delaxes(axes[j])

# Adjust layout and add a main title
plt.subplots_adjust(
    top=0.85,
    bottom=0.18,
    hspace=0.6,
    wspace=0.35
)
plt.suptitle('Time to Estimate Delivery Days of 10 Largest Sellers, Dot Size by Customer Payment Value', fontsize=16)

# Show the plot
plt.show()
No description has been provided for this image

Average freight cost for customers¶

In [ ]:
# @title Customer Freight Share for 10 Largest Sellers (Dot Size by Customer Payment Value)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.colors import Normalize
from matplotlib import colormaps

# Get the 10 largest sellers by order count
largest_seller_ids = orders_cleaned['seller_id'].value_counts().nlargest(10).index.tolist()

# Create a 3x4 grid of subplots (12 slots for 10 sellers; extras removed below)
fig, axes = plt.subplots(3, 4, figsize=(24, 12))
axes = axes.flatten()  # Flatten the axes array for easier indexing

# Global min and max freight share so every subplot shares one color scale
global_vmin = orders_cleaned['freight_share'].min()
global_vmax = orders_cleaned['freight_share'].max()

# Loop-invariant objects hoisted out of the loop: the colormap, the shared
# normalization, and the ranking subtitles were previously rebuilt on every
# iteration for no benefit.
cmap = colormaps['viridis_r']  # inverted viridis
norm = Normalize(vmin=global_vmin, vmax=global_vmax)
titles = [
    "Largest Seller", "2nd Largest Seller", "3rd Largest Seller",
    "4th Largest Seller", "5th Largest Seller", "6th Largest Seller",
    "7th Largest Seller", "8th Largest Seller", "9th Largest Seller", "10th Largest Seller"
]

# Loop through the 10 largest sellers and plot one subplot each
for i, seller_id in enumerate(largest_seller_ids):
    # Filter data for the current seller
    seller_data = orders_cleaned[orders_cleaned['seller_id'] == seller_id].copy()

    if not seller_data.empty:
        # Customer locations: color = freight share, dot size = payment value
        sns.scatterplot(
            x=seller_data['c_geolocation_lng'],
            y=seller_data['c_geolocation_lat'],
            hue=seller_data['freight_share'],
            palette=cmap,
            size=seller_data['total_payment_value'],
            sizes=(20, 200),
            ax=axes[i],
            legend=False,
            hue_norm=norm
        )

        # Mark the seller's mean location as a black dot
        axes[i].scatter(
            seller_data['s_geolocation_lng'].mean(),
            seller_data['s_geolocation_lat'].mean(),
            color='black',
            s=150,
            label='Seller Location'
        )

        # Subtitle by seller ranking
        axes[i].set_title(titles[i], fontsize=14, pad=20)

        # Set labels for the axes and adjust padding
        axes[i].set_xlabel('Longitude', labelpad=20)
        axes[i].set_ylabel('Latitude', labelpad=20)

        # Remove the top, left, and right spines
        axes[i].spines['top'].set_visible(False)
        axes[i].spines['left'].set_visible(False)
        axes[i].spines['right'].set_visible(False)
    else:
        # Hide the empty subplot
        axes[i].set_visible(False)

# One shared horizontal colorbar for freight share (same cmap/norm as subplots)
cbar = plt.colorbar(
    plt.cm.ScalarMappable(norm=norm, cmap=cmap),
    ax=axes,
    orientation='horizontal',
    fraction=0.05,
    pad=0.12
)
cbar.set_label('Freight Share', fontsize=12)
cbar.ax.tick_params(labelsize=10)

# Remove the unused grid slots (12 axes, only 10 sellers)
for j in range(len(largest_seller_ids), len(axes)):
    fig.delaxes(axes[j])

# Adjust layout and add a main title
plt.subplots_adjust(
    top=0.85,
    bottom=0.18,
    hspace=0.6,
    wspace=0.35
)
plt.suptitle('Customer Freight Share for 10 Largest Sellers, Dot Size by Customer Payment Value', fontsize=16)

# Show the plot
plt.show()
No description has been provided for this image

Cost efficiency¶

In [ ]:
# @title Cost Efficiency of 10 Largest Sellers (Dot Size by Customer Payment Value)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.colors import Normalize
from matplotlib import colormaps

# Ten sellers ranked by number of orders.
largest_seller_ids = orders_cleaned['seller_id'].value_counts().nlargest(10).index.tolist()

# 12-slot grid (3 rows x 4 columns); the two unused slots are removed later.
fig, axes = plt.subplots(3, 4, figsize=(24, 12))
axes = axes.flatten()

# Shared color range so cost efficiency is comparable across all subplots.
global_vmin = orders_cleaned['cost_efficiency'].min()
global_vmax = orders_cleaned['cost_efficiency'].max()

cmap = colormaps['viridis']
norm = Normalize(vmin=global_vmin, vmax=global_vmax)

titles = [
    "Largest Seller", "2nd Largest Seller", "3rd Largest Seller",
    "4th Largest Seller", "5th Largest Seller", "6th Largest Seller",
    "7th Largest Seller", "8th Largest Seller", "9th Largest Seller", "10th Largest Seller"
]

for idx, current_seller in enumerate(largest_seller_ids):
    panel = axes[idx]
    subset = orders_cleaned[orders_cleaned['seller_id'] == current_seller].copy()

    if subset.empty:
        # Nothing to draw for this seller; blank out the panel.
        panel.set_visible(False)
        continue

    # Customer locations: color = cost efficiency, dot size = payment value.
    sns.scatterplot(
        x=subset['c_geolocation_lng'],
        y=subset['c_geolocation_lat'],
        hue=subset['cost_efficiency'],
        palette=cmap,
        size=subset['total_payment_value'],
        sizes=(20, 200),
        ax=panel,
        legend=False,
        hue_norm=norm
    )

    # Seller's mean location drawn as a black marker.
    panel.scatter(
        subset['s_geolocation_lng'].mean(),
        subset['s_geolocation_lat'].mean(),
        color='black',
        s=150,
        label='Seller Location'
    )

    panel.set_title(titles[idx], fontsize=14, pad=20)
    panel.set_xlabel('Longitude', labelpad=20)
    panel.set_ylabel('Latitude', labelpad=20)

    # Keep only the bottom spine for a cleaner look.
    for side in ('top', 'left', 'right'):
        panel.spines[side].set_visible(False)

# Shared horizontal colorbar for the cost-efficiency scale.
cbar = plt.colorbar(
    plt.cm.ScalarMappable(norm=norm, cmap=cmap),
    ax=axes,
    orientation='horizontal',
    fraction=0.05,
    pad=0.12
)
cbar.set_label('Cost Efficiency', fontsize=12)
cbar.ax.tick_params(labelsize=10)

# Drop the grid slots that never received a seller.
for j in range(len(largest_seller_ids), len(axes)):
    fig.delaxes(axes[j])

plt.subplots_adjust(top=0.85, bottom=0.18, hspace=0.6, wspace=0.35)
plt.suptitle('Cost Efficiency of 10 Largest Sellers, Dot Size by Customer Payment Value', fontsize=16)

plt.show()
No description has been provided for this image

Customer reviews¶

In [ ]:
# @title Customer Review Score of 10 Largest Sellers (Dot Size by Customer Payment Value)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.colors import Normalize
from matplotlib import colormaps

# Ten sellers with the most orders.
largest_seller_ids = orders_cleaned['seller_id'].value_counts().nlargest(10).index.tolist()

# A 3x4 grid gives 12 axes; the leftover slots are removed at the end.
fig, axes = plt.subplots(3, 4, figsize=(24, 12))
axes = axes.flatten()

# One shared color range so review scores are comparable between subplots.
global_vmin = orders_cleaned['review_score'].min()
global_vmax = orders_cleaned['review_score'].max()

cmap = colormaps['viridis']
norm = Normalize(vmin=global_vmin, vmax=global_vmax)

titles = [
    "Largest Seller", "2nd Largest Seller", "3rd Largest Seller",
    "4th Largest Seller", "5th Largest Seller", "6th Largest Seller",
    "7th Largest Seller", "8th Largest Seller", "9th Largest Seller", "10th Largest Seller"
]

for i, seller_id in enumerate(largest_seller_ids):
    ax = axes[i]
    seller_data = orders_cleaned[orders_cleaned['seller_id'] == seller_id].copy()

    if seller_data.empty:
        # No orders for this seller; hide its panel.
        ax.set_visible(False)
        continue

    # Customer locations: color = review score, dot size = payment value.
    sns.scatterplot(
        x=seller_data['c_geolocation_lng'],
        y=seller_data['c_geolocation_lat'],
        hue=seller_data['review_score'],
        palette=cmap,
        size=seller_data['total_payment_value'],
        sizes=(20, 200),
        ax=ax,
        legend=False,
        hue_norm=norm
    )

    # Seller's mean location drawn as a black marker.
    ax.scatter(
        seller_data['s_geolocation_lng'].mean(),
        seller_data['s_geolocation_lat'].mean(),
        color='black',
        s=150,
        label='Seller Location'
    )

    ax.set_title(titles[i], fontsize=14, pad=20)
    ax.set_xlabel('Longitude', labelpad=20)
    ax.set_ylabel('Latitude', labelpad=20)

    # Keep only the bottom spine.
    for side in ('top', 'left', 'right'):
        ax.spines[side].set_visible(False)

# Shared horizontal colorbar for the review-score scale.
cbar = plt.colorbar(
    plt.cm.ScalarMappable(norm=norm, cmap=cmap),
    ax=axes,
    orientation='horizontal',
    fraction=0.05,
    pad=0.12
)
cbar.set_label('Review Score', fontsize=12)
cbar.ax.tick_params(labelsize=10)

# Remove the unused grid slots.
for j in range(len(largest_seller_ids), len(axes)):
    fig.delaxes(axes[j])

plt.subplots_adjust(top=0.85, bottom=0.18, hspace=0.6, wspace=0.35)
plt.suptitle('Customer Review Score of 10 Largest Sellers, Dot Size by Customer Payment Value', fontsize=16)

plt.show()
No description has been provided for this image

Plotting 10 largest product categories¶

Delivery delay (in days)¶

In [ ]:
###
# Visualizing delivery delays (in days) by product category is worthwhile for several reasons:
#
# 1. Understanding Delivery Performance by Category: By plotting delivery delays for different product categories, businesses can
#    identify which categories are experiencing longer delivery times. This insight enables companies to focus on improving
#    logistics and operational efficiencies for specific categories, ensuring that customers receive their orders in a timely
#    manner.
#
# 2. Enhancing Customer Experience: Understanding how delivery delays vary by category can help businesses manage customer
#    expectations more effectively. If certain categories consistently have longer delivery times, businesses can proactively
#    communicate this information to customers, adjust marketing strategies, or implement changes to improve service levels in
#    those categories, ultimately enhancing customer satisfaction.
#
# 3. Informed Inventory and Supplier Management: Analyzing delivery delays by category allows businesses to make data-driven
#    decisions regarding inventory management and supplier selection. If specific categories show persistent delays, businesses
#    can reevaluate their supplier relationships, explore alternative logistics providers, or adjust inventory levels to mitigate
#    these delays and better align supply with customer demand.
###
In [ ]:
# @title Delivery Delay (in days) by Product Category (10 Largest Categories)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.colors import Normalize
from matplotlib import colormaps

# Portuguese -> English display names for product categories
category_mapping = {
    'cama_mesa_banho': 'Bedding & Bath',
    'beleza_saude': 'Beauty & Health',
    'esporte_lazer': 'Sports & Leisure',
    'informatica_acessorios': 'Computers & Accessories',
    'moveis_decoracao': 'Furniture & Decoration',
    'utilidades_domesticas': 'Household Items',
    'relogios_presentes': 'Watches & Gifts',
    'telefonia': 'Telephony',
    'automotivo': 'Automotive',
    'brinquedos': 'Toys',
    'cool_stuff': 'Cool Stuff',
    'ferramentas_jardim': 'Tools & Garden',
    'perfumaria': 'Perfumery',
    'bebes': 'Babies',
    'eletronicos': 'Electronics'
}

# Get the 10 largest product categories
largest_categories = orders_cleaned['product_category_name'].value_counts().nlargest(10).index.tolist()

# Create a 4x3 grid of subplots (12 slots for 10 categories; extras removed below)
fig, axes = plt.subplots(4, 3, figsize=(20, 16))
axes = axes.flatten()  # Flatten the axes array for easier indexing

# Inverted viridis: LOW delays render as light yellow, HIGH delays as dark
# purple. (Corrected comment: the old one claimed shorter delays were darker,
# which is the opposite of what 'viridis_r' does.) Loop-invariant, so created
# once instead of per iteration.
cmap = colormaps['viridis_r']

# Loop through the 10 largest categories and plot
for i, category in enumerate(largest_categories):
    # Filter data for the current product category
    category_data = orders_cleaned[orders_cleaned['product_category_name'] == category].copy()

    if not category_data.empty:
        # Metric shown on the color axis: days relative to the delivery estimate
        metric = category_data['time_to_estimate_delivery_days']

        # Per-category color range (each subplot carries its own colorbar)
        vmin = metric.min()
        vmax = metric.max()

        # Guard against a degenerate range so Normalize stays well-defined
        if vmin == vmax:
            vmax = vmin + 1

        norm = Normalize(vmin=vmin, vmax=vmax)

        # Scatter customer locations colored by delivery delay.
        # Fix: the original call also passed sizes=(20, 200) without any
        # `size` variable; seaborn ignores `sizes` in that case, so the
        # misleading no-op argument has been dropped.
        sns.scatterplot(
            x=category_data['c_geolocation_lng'],
            y=category_data['c_geolocation_lat'],
            hue=metric,
            palette=cmap,
            ax=axes[i],
            legend=False,
            hue_norm=norm
        )

        # Subtitle: English category name, falling back to the raw name
        axes[i].set_title(f'Category: {category_mapping.get(category, category)}', fontsize=14)

        # Set labels for the axes
        axes[i].set_xlabel('Longitude')
        axes[i].set_ylabel('Latitude')

        # Remove the top, left, and right spines for cleaner look
        axes[i].spines['top'].set_visible(False)
        axes[i].spines['left'].set_visible(False)
        axes[i].spines['right'].set_visible(False)

        # Per-subplot colorbar for this category's delay range
        cbar = plt.colorbar(
            plt.cm.ScalarMappable(norm=norm, cmap=cmap),
            ax=axes[i],
            orientation='vertical'
        )
        cbar.set_label('Delivery Delay (days)', fontsize=12)
        cbar.ax.tick_params(labelsize=10)

        # Integer ticks, stepped so that roughly six labels appear
        tick_positions = np.arange(int(vmin), int(vmax) + 1, max(1, (vmax - vmin) // 5))
        cbar.set_ticks(tick_positions)
        cbar.ax.set_yticklabels([f'{int(tick)}' for tick in tick_positions])

    else:
        # Hide the empty subplot
        axes[i].set_visible(False)

# Delete the unused grid slots (12 axes, only 10 categories)
for j in range(len(largest_categories), len(axes)):
    fig.delaxes(axes[j])

# Adjust layout for clarity, leaving headroom for the suptitle
plt.tight_layout(rect=[0, 0, 1, 0.95])

# Set main title for the entire figure
plt.suptitle('Delivery Delay (in days) by Product Category', fontsize=16)

# Show the plot
plt.show()
No description has been provided for this image

Look at only late deliveries¶

In [ ]:
# @title Delivery Delay (in days) by Product Category (10 Largest Categories with Late Deliveries Only)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.colors import Normalize
from matplotlib import colormaps

# Portuguese -> English display names for product categories
category_mapping = {
    'cama_mesa_banho': 'Bedding & Bath',
    'beleza_saude': 'Beauty & Health',
    'esporte_lazer': 'Sports & Leisure',
    'informatica_acessorios': 'Computers & Accessories',
    'moveis_decoracao': 'Furniture & Decoration',
    'utilidades_domesticas': 'Household Items',
    'relogios_presentes': 'Watches & Gifts',
    'telefonia': 'Telephony',
    'automotivo': 'Automotive',
    'brinquedos': 'Toys',
    'cool_stuff': 'Cool Stuff',
    'ferramentas_jardim': 'Tools & Garden',
    'perfumaria': 'Perfumery',
    'bebes': 'Babies',
    'eletronicos': 'Electronics'
}

# Get the 10 largest product categories (ranked over ALL orders, not only late ones)
largest_categories = orders_cleaned['product_category_name'].value_counts().nlargest(10).index.tolist()

# Keep only orders whose delay relative to the estimate is positive ("late")
late_deliveries = orders_cleaned[orders_cleaned['time_to_estimate_delivery_days'] > 0]

# Create a 4x3 grid of subplots (12 slots for 10 categories; extras removed below)
fig, axes = plt.subplots(4, 3, figsize=(20, 16))
axes = axes.flatten()  # Flatten the axes array for easier indexing

# Inverted viridis: LOW delays render as light yellow, HIGH delays as dark
# purple. (Corrected comment: the old one claimed shorter delays were darker,
# which is the opposite of what 'viridis_r' does.) Loop-invariant, so created
# once instead of per iteration.
cmap = colormaps['viridis_r']

# Loop through the 10 largest categories and plot
for i, category in enumerate(largest_categories):
    # Filter the late-delivery subset for the current product category
    category_data = late_deliveries[late_deliveries['product_category_name'] == category].copy()

    if not category_data.empty:
        # Metric shown on the color axis: days past the delivery estimate
        metric = category_data['time_to_estimate_delivery_days']

        # Per-category color range (each subplot carries its own colorbar)
        vmin = metric.min()
        vmax = metric.max()

        # Guard against a degenerate range so Normalize stays well-defined
        if vmin == vmax:
            vmax = vmin + 1

        norm = Normalize(vmin=vmin, vmax=vmax)

        # Scatter customer locations colored by delivery delay.
        # Fix: the original call also passed sizes=(20, 200) without any
        # `size` variable; seaborn ignores `sizes` in that case, so the
        # misleading no-op argument has been dropped.
        sns.scatterplot(
            x=category_data['c_geolocation_lng'],
            y=category_data['c_geolocation_lat'],
            hue=metric,
            palette=cmap,
            ax=axes[i],
            legend=False,
            hue_norm=norm
        )

        # Subtitle: English category name, falling back to the raw name
        axes[i].set_title(f'Category: {category_mapping.get(category, category)}', fontsize=14)

        # Set labels for the axes
        axes[i].set_xlabel('Longitude')
        axes[i].set_ylabel('Latitude')

        # Remove the top, left, and right spines for cleaner look
        axes[i].spines['top'].set_visible(False)
        axes[i].spines['left'].set_visible(False)
        axes[i].spines['right'].set_visible(False)

        # Per-subplot colorbar for this category's delay range
        cbar = plt.colorbar(
            plt.cm.ScalarMappable(norm=norm, cmap=cmap),
            ax=axes[i],
            orientation='vertical'
        )
        cbar.set_label('Delivery delay (days)', fontsize=12)
        cbar.ax.tick_params(labelsize=10)

        # Integer ticks, stepped so that roughly six labels appear
        tick_positions = np.arange(int(vmin), int(vmax) + 1, max(1, (vmax - vmin) // 5))
        cbar.set_ticks(tick_positions)
        cbar.ax.set_yticklabels([f'{int(tick)}' for tick in tick_positions])

    else:
        # Hide the empty subplot
        axes[i].set_visible(False)

# Delete the unused grid slots (12 axes, only 10 categories)
for j in range(len(largest_categories), len(axes)):
    fig.delaxes(axes[j])

# Adjust layout for clarity, leaving headroom for the suptitle
plt.tight_layout(rect=[0, 0, 1, 0.95])

# Set main title for the entire figure
plt.suptitle('Delivery Delay (in days) by Product Category (Late Deliveries Only)', fontsize=16)

# Show the plot
plt.show()
No description has been provided for this image

Delivery time (days)¶

In [ ]:
# @title Delivery Time (in days) by Product Category (10 Largest Categories)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.colors import Normalize
from matplotlib import colormaps

# Portuguese -> English display names for product categories
category_mapping = {
    'cama_mesa_banho': 'Bedding & Bath',
    'beleza_saude': 'Beauty & Health',
    'esporte_lazer': 'Sports & Leisure',
    'informatica_acessorios': 'Computers & Accessories',
    'moveis_decoracao': 'Furniture & Decoration',
    'utilidades_domesticas': 'Household Items',
    'relogios_presentes': 'Watches & Gifts',
    'telefonia': 'Telephony',
    'automotivo': 'Automotive',
    'brinquedos': 'Toys',
    'cool_stuff': 'Cool Stuff',
    'ferramentas_jardim': 'Tools & Garden',
    'perfumaria': 'Perfumery',
    'bebes': 'Babies',
    'eletronicos': 'Electronics'
}

# Get the 10 largest product categories
largest_categories = orders_cleaned['product_category_name'].value_counts().nlargest(10).index.tolist()

# Create a 4x3 grid of subplots (12 slots for 10 categories; extras removed below)
fig, axes = plt.subplots(4, 3, figsize=(20, 16))
axes = axes.flatten()  # Flatten the axes array for easier indexing

# Inverted viridis: LOW times render as light yellow, HIGH times as dark
# purple. (Corrected comment: the old one claimed shorter times were darker,
# which is the opposite of what 'viridis_r' does.) Loop-invariant, so created
# once instead of per iteration.
cmap = colormaps['viridis_r']

# Loop through the 10 largest categories and plot
for i, category in enumerate(largest_categories):
    # Filter data for the current product category
    category_data = orders_cleaned[orders_cleaned['product_category_name'] == category].copy()

    if not category_data.empty:
        # Metric shown on the color axis: total delivery time in days
        metric = category_data['delivery_time_days']

        # Per-category color range (each subplot carries its own colorbar)
        vmin = metric.min()
        vmax = metric.max()

        # Guard against a degenerate range so Normalize stays well-defined
        if vmin == vmax:
            vmax = vmin + 1

        norm = Normalize(vmin=vmin, vmax=vmax)

        # Scatter customer locations colored by delivery time.
        # Fix: the original call also passed sizes=(20, 200) without any
        # `size` variable; seaborn ignores `sizes` in that case, so the
        # misleading no-op argument has been dropped.
        sns.scatterplot(
            x=category_data['c_geolocation_lng'],
            y=category_data['c_geolocation_lat'],
            hue=metric,
            palette=cmap,
            ax=axes[i],
            legend=False,
            hue_norm=norm
        )

        # Subtitle: English category name, falling back to the raw name
        axes[i].set_title(f'Category: {category_mapping.get(category, category)}', fontsize=14)

        # Set labels for the axes
        axes[i].set_xlabel('Longitude')
        axes[i].set_ylabel('Latitude')

        # Remove the top, left, and right spines for a cleaner look
        axes[i].spines['top'].set_visible(False)
        axes[i].spines['left'].set_visible(False)
        axes[i].spines['right'].set_visible(False)

        # Per-subplot colorbar for this category's delivery-time range
        cbar = plt.colorbar(
            plt.cm.ScalarMappable(norm=norm, cmap=cmap),
            ax=axes[i],
            orientation='vertical'
        )
        cbar.set_label('Delivery time (days)', fontsize=12)
        cbar.ax.tick_params(labelsize=10)

        # Integer ticks, stepped so that roughly six labels appear
        tick_positions = np.arange(int(vmin), int(vmax) + 1, max(1, (vmax - vmin) // 5))
        cbar.set_ticks(tick_positions)
        cbar.ax.set_yticklabels([f'{int(tick)}' for tick in tick_positions])

    else:
        # Hide the empty subplot
        axes[i].set_visible(False)

# Delete the unused grid slots (12 axes, only 10 categories)
for j in range(len(largest_categories), len(axes)):
    fig.delaxes(axes[j])

# Adjust layout for clarity, leaving headroom for the suptitle
plt.tight_layout(rect=[0, 0, 1, 0.95])

# Set main title for the entire figure
plt.suptitle('Delivery Time (in days) by Product Category', fontsize=16)

# Show the plot
plt.show()
No description has been provided for this image

Sellers by Product Categories¶

Delivery time in days¶

In [ ]:
# @title Delivery Time (days) by Product Category (Seller Locations, Dot Size by Orders Filled)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.colors import Normalize
from matplotlib import colormaps

# Mapping of product categories from Portuguese to English
category_translation = {
    'cama_mesa_banho': 'Bedding, Bath & Table',
    'beleza_saude': 'Beauty & Health',
    'esporte_lazer': 'Sports & Leisure',
    'utilidades_domesticas': 'Household Items',
    'moveis_decoracao': 'Furniture & Decoration',
    'informatica_acessorios': 'Computers & Accessories',
    'relogios_presentes': 'Watches & Gifts',
    'brinquedos': 'Toys',
    'telefonia': 'Telephony',
    'automotivo': 'Automotive'
}

# Get the 10 largest product categories
largest_categories = orders_cleaned['product_category_name'].value_counts().nlargest(10).index.tolist()

# Create a 4x3 grid of subplots (12 slots for 10 categories; extras removed below)
fig, axes = plt.subplots(4, 3, figsize=(20, 16))
axes = axes.flatten()  # Flatten the axes array for easier indexing

# Loop through the 10 largest categories and plot
for i, category in enumerate(largest_categories):
    # Filter data for the current product category
    category_data = orders_cleaned[orders_cleaned['product_category_name'] == category].copy()

    # One row per seller: mean location, mean delivery time, order count
    seller_data = category_data.groupby('seller_id').agg({
        's_geolocation_lng': 'mean',  # Mean longitude of seller
        's_geolocation_lat': 'mean',  # Mean latitude of seller
        'delivery_time_days': 'mean',  # Mean delivery time
        'order_id': 'count'           # Count of orders fulfilled by the seller
    }).reset_index()

    if not seller_data.empty:
        # Inverted viridis: low values light yellow, high values dark purple
        cmap = colormaps['viridis_r']

        # Per-category color range for the seller-level mean delivery times
        vmin = seller_data['delivery_time_days'].min()
        vmax = seller_data['delivery_time_days'].max()

        # Fix: degenerate-range guard added for parity with the other
        # category cells — if every seller shares one mean delivery time,
        # Normalize and the tick-step computation below would otherwise
        # collapse to a zero-width range.
        if vmin == vmax:
            vmax = vmin + 1

        norm = Normalize(vmin=vmin, vmax=vmax)

        # Seller locations: color = mean delivery time, dot size = orders filled
        scatter = sns.scatterplot(
            x=seller_data['s_geolocation_lng'],
            y=seller_data['s_geolocation_lat'],
            hue=seller_data['delivery_time_days'],
            size=seller_data['order_id'],
            sizes=(50, 400),
            ax=axes[i],
            legend=False,
            hue_norm=norm,
            palette=cmap
        )

        # Subtitle: English category name, falling back to the raw name
        translated_category = category_translation.get(category, category)
        axes[i].set_title(f'Category: {translated_category}', fontsize=14)

        # Set labels for the axes
        axes[i].set_xlabel('Seller Longitude')
        axes[i].set_ylabel('Seller Latitude')

        # Remove the top, left, and right spines for a cleaner look
        axes[i].spines['top'].set_visible(False)
        axes[i].spines['left'].set_visible(False)
        axes[i].spines['right'].set_visible(False)

        # Per-subplot colorbar for this category's delivery-time range
        cbar = plt.colorbar(
            plt.cm.ScalarMappable(norm=norm, cmap=cmap),
            ax=axes[i],
            orientation='vertical'
        )
        cbar.set_label('Delivery Time (days)', fontsize=12)
        cbar.ax.tick_params(labelsize=10)

        # Integer ticks, stepped so that roughly six labels appear
        tick_positions = np.arange(int(vmin), int(vmax) + 1, max(1, (vmax - vmin) // 5))
        cbar.set_ticks(tick_positions)
        cbar.ax.set_yticklabels([f'{int(tick)}' for tick in tick_positions])

    else:
        # Hide the empty subplot
        axes[i].set_visible(False)

# Delete the unused grid slots (12 axes, only 10 categories)
for j in range(len(largest_categories), len(axes)):
    fig.delaxes(axes[j])

# Adjust layout for clarity, leaving headroom for the suptitle
plt.tight_layout(rect=[0, 0, 1, 0.95])

# Set main title for the entire figure
plt.suptitle('Delivery Time (days) by Product Category (Seller Locations, Dot Size by Orders Filled)', fontsize=16)

# Show the plot
plt.show()
No description has been provided for this image

Delivery to Estimate (in days)¶

In [ ]:
# @title Delivery to Estimate (days) by Product Category (Seller Locations, Dot Size by Orders Filled)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.colors import Normalize
from matplotlib import colormaps

# Mapping of product categories from Portuguese to English
category_translation = {
    'cama_mesa_banho': 'Bedding, Bath & Table',
    'beleza_saude': 'Beauty & Health',
    'esporte_lazer': 'Sports & Leisure',
    'utilidades_domesticas': 'Household Items',
    'moveis_decoracao': 'Furniture & Decoration',
    'informatica_acessorios': 'Computers & Accessories',
    'relogios_presentes': 'Watches & Gifts',
    'brinquedos': 'Toys',
    'telefonia': 'Telephony',
    'automotivo': 'Automotive'
}

# The colormap is loop-invariant, so build it once. Inverted viridis
# renders shorter (better) delivery-to-estimate times in darker shades.
cmap = colormaps['viridis_r']

# Get the 10 largest product categories by order count
largest_categories = orders_cleaned['product_category_name'].value_counts().nlargest(10).index.tolist()

# 4x3 grid: 10 populated panels plus 2 axes that are removed below
fig, axes = plt.subplots(4, 3, figsize=(20, 16))
axes = axes.flatten()  # 1-D indexing is simpler than (row, col) pairs

# One panel per category
for i, category in enumerate(largest_categories):
    # Rows belonging to the current product category
    category_data = orders_cleaned[orders_cleaned['product_category_name'] == category].copy()

    # One row per seller: mean location, mean delivery-to-estimate, order count
    seller_data = category_data.groupby('seller_id').agg({
        's_geolocation_lng': 'mean',   # Mean longitude of seller
        's_geolocation_lat': 'mean',   # Mean latitude of seller
        'time_to_estimate_delivery_days': 'mean',  # Mean delivery-to-estimate time
        'order_id': 'count'            # Count of orders fulfilled by the seller
    }).reset_index()

    # Only plot categories that actually have seller data
    if not seller_data.empty:
        # Normalize colors per category so each panel uses its full value range
        vmin = seller_data['time_to_estimate_delivery_days'].min()
        vmax = seller_data['time_to_estimate_delivery_days'].max()
        norm = Normalize(vmin=vmin, vmax=vmax)

        # Scatter sellers by location; color = delivery-to-estimate days,
        # dot size = number of orders the seller fulfilled
        sns.scatterplot(
            x=seller_data['s_geolocation_lng'],
            y=seller_data['s_geolocation_lat'],
            hue=seller_data['time_to_estimate_delivery_days'],
            size=seller_data['order_id'],
            sizes=(50, 400),        # Min/max dot size mapped from order count
            ax=axes[i],             # Corresponding subplot
            legend=False,           # Meaning is conveyed by the colorbar instead
            hue_norm=norm,          # Keep hue mapping consistent with the colorbar
            palette=cmap
        )

        # English title; fall back to the original name if no translation exists
        translated_category = category_translation.get(category, category)
        axes[i].set_title(f'Category: {translated_category}', fontsize=14)

        # Set labels for the axes
        axes[i].set_xlabel('Seller Longitude')
        axes[i].set_ylabel('Seller Latitude')

        # Remove the top, left, and right spines for clarity
        for spine in ('top', 'left', 'right'):
            axes[i].spines[spine].set_visible(False)

        # Per-panel colorbar (each panel has its own normalization)
        cbar = plt.colorbar(
            plt.cm.ScalarMappable(norm=norm, cmap=cmap),
            ax=axes[i],
            orientation='vertical'
        )
        cbar.set_label('Delivery to Estimate (days)', fontsize=12)
        cbar.ax.tick_params(labelsize=10)  # Adjust tick label size

        # Integer ticks with a step that yields roughly 5 labels;
        # max(1, ...) guards against a zero step for narrow value ranges
        tick_positions = np.arange(int(vmin), int(vmax) + 1, max(1, (vmax - vmin) // 5))
        cbar.set_ticks(tick_positions)
        cbar.ax.set_yticklabels([f'{int(tick)}' for tick in tick_positions])

    else:
        # No sellers for this category: hide the panel entirely
        axes[i].set_visible(False)

# Drop the unused axes (12-panel grid, only 10 categories)
for j in range(len(largest_categories), len(axes)):
    fig.delaxes(axes[j])

# Adjust layout, leaving headroom for the suptitle
plt.tight_layout(rect=[0, 0, 1, 0.95])

# Set main title for the entire figure
plt.suptitle('Delivery to Estimate (days) by Product Category (Seller Locations, Dot Size by Orders Filled)', fontsize=16)

# Show the plot
plt.show()
No description has been provided for this image

Cost Efficiency (freight value over delivery time in days)¶

In [ ]:
# @title Cost Efficiency by Product Category (Seller Locations, Dot Size by Orders Filled)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.colors import Normalize
from matplotlib import colormaps

# Mapping of product categories from Portuguese to English
category_translation = {
    'cama_mesa_banho': 'Bedding, Bath & Table',
    'beleza_saude': 'Beauty & Health',
    'esporte_lazer': 'Sports & Leisure',
    'utilidades_domesticas': 'Household Items',
    'moveis_decoracao': 'Furniture & Decoration',
    'informatica_acessorios': 'Computers & Accessories',
    'relogios_presentes': 'Watches & Gifts',
    'brinquedos': 'Toys',
    'telefonia': 'Telephony',
    'automotivo': 'Automotive'
}

# Colormap is loop-invariant, so build it once
cmap = colormaps['viridis']

# Get the 10 largest product categories by order count
largest_categories = orders_cleaned['product_category_name'].value_counts().nlargest(10).index.tolist()

# 4x3 grid: 10 populated panels plus 2 axes that are removed below
fig, axes = plt.subplots(4, 3, figsize=(20, 16))
axes = axes.flatten()  # 1-D indexing is simpler than (row, col) pairs

# One panel per category
for i, category in enumerate(largest_categories):
    # Rows belonging to the current product category
    category_data = orders_cleaned[orders_cleaned['product_category_name'] == category].copy()

    # One row per seller: mean location, mean cost efficiency, order count
    seller_data = category_data.groupby('seller_id').agg({
        's_geolocation_lng': 'mean',   # Mean longitude of seller
        's_geolocation_lat': 'mean',   # Mean latitude of seller
        'cost_efficiency': 'mean',     # Mean cost efficiency (freight value / delivery days)
        'order_id': 'count'            # Count of orders fulfilled by the seller
    }).reset_index()

    # Only plot categories that actually have seller data
    if not seller_data.empty:
        # Metric to color by: mean cost efficiency per seller
        metric = seller_data['cost_efficiency']

        # Normalize colors per category so each panel uses its full value range
        vmin, vmax = metric.min(), metric.max()
        norm = Normalize(vmin=vmin, vmax=vmax)

        # Scatter sellers by location; color = cost efficiency,
        # dot size = number of orders the seller fulfilled
        sns.scatterplot(
            x=seller_data['s_geolocation_lng'],
            y=seller_data['s_geolocation_lat'],
            hue=metric,
            size=seller_data['order_id'],
            sizes=(50, 400),        # Min/max dot size mapped from order count
            ax=axes[i],             # Corresponding subplot
            legend=False,           # Meaning is conveyed by the colorbar instead
            hue_norm=norm,          # Keep hue mapping consistent with the colorbar
            palette=cmap
        )

        # English title; fall back to the original name if no translation exists
        translated_category = category_translation.get(category, category)
        axes[i].set_title(f'Category: {translated_category}', fontsize=14)

        # Set axis labels
        axes[i].set_xlabel('Seller Longitude')
        axes[i].set_ylabel('Seller Latitude')

        # Remove top, left, and right spines for clarity
        for spine in ('top', 'left', 'right'):
            axes[i].spines[spine].set_visible(False)

        # Per-panel colorbar (each panel has its own normalization)
        cbar = plt.colorbar(
            plt.cm.ScalarMappable(norm=norm, cmap=cmap),
            ax=axes[i],
            orientation='vertical'
        )
        cbar.set_label('Cost Efficiency', fontsize=12)
        cbar.ax.tick_params(labelsize=10)

        # 5 evenly spaced float ticks. (Truncating to int here would collapse
        # small-valued ratios like cost efficiency into duplicate/zero labels.)
        tick_positions = np.linspace(vmin, vmax, num=5)
        cbar.set_ticks(tick_positions)
        cbar.ax.set_yticklabels([f'{tick:.2f}' for tick in tick_positions])

    else:
        # No sellers for this category: hide the panel entirely
        axes[i].set_visible(False)

# Drop the unused axes (12-panel grid, only 10 categories)
for j in range(len(largest_categories), len(axes)):
    fig.delaxes(axes[j])

# Adjust layout, leaving headroom for the suptitle
plt.tight_layout(rect=[0, 0, 1, 0.95])

# Set main title for the figure
plt.suptitle('Cost Efficiency by Product Category (Seller Locations, Dot Size by Orders Filled)', fontsize=16)

# Show the plot
plt.show()
No description has been provided for this image

Distance Efficiency (distance in km over delivery time in days)¶

In [ ]:
# @title Distance Efficiency by Product Category (Seller Locations, Dot Size by Orders Filled)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.colors import Normalize
from matplotlib import colormaps

# Mapping of product categories from Portuguese to English
category_translation = {
    'cama_mesa_banho': 'Bedding, Bath & Table',
    'beleza_saude': 'Beauty & Health',
    'esporte_lazer': 'Sports & Leisure',
    'utilidades_domesticas': 'Household Items',
    'moveis_decoracao': 'Furniture & Decoration',
    'informatica_acessorios': 'Computers & Accessories',
    'relogios_presentes': 'Watches & Gifts',
    'brinquedos': 'Toys',
    'telefonia': 'Telephony',
    'automotivo': 'Automotive'
}

# Colormap is loop-invariant, so build it once
cmap = colormaps['viridis']

# Get the 10 largest product categories by order count
largest_categories = orders_cleaned['product_category_name'].value_counts().nlargest(10).index.tolist()

# 4x3 grid: 10 populated panels plus 2 axes that are removed below
fig, axes = plt.subplots(4, 3, figsize=(20, 16))
axes = axes.flatten()  # 1-D indexing is simpler than (row, col) pairs

# One panel per category
for i, category in enumerate(largest_categories):
    # Rows belonging to the current product category
    category_data = orders_cleaned[orders_cleaned['product_category_name'] == category].copy()

    # One row per seller: mean location, mean distance efficiency, order count
    seller_data = category_data.groupby('seller_id').agg({
        's_geolocation_lng': 'mean',       # Mean longitude of seller
        's_geolocation_lat': 'mean',       # Mean latitude of seller
        'distance_efficiency': 'mean',     # Mean distance efficiency (km / delivery days)
        'order_id': 'count'                # Count of orders fulfilled by the seller
    }).reset_index()

    # Only plot categories that actually have seller data
    if not seller_data.empty:
        # Metric to color by: mean distance efficiency per seller
        metric = seller_data['distance_efficiency']

        # Normalize colors per category so each panel uses its full value range
        vmin, vmax = metric.min(), metric.max()
        norm = Normalize(vmin=vmin, vmax=vmax)

        # Scatter sellers by location; color = distance efficiency,
        # dot size = number of orders the seller fulfilled
        sns.scatterplot(
            x=seller_data['s_geolocation_lng'],
            y=seller_data['s_geolocation_lat'],
            hue=metric,
            size=seller_data['order_id'],
            sizes=(50, 400),        # Min/max dot size mapped from order count
            ax=axes[i],             # Corresponding subplot
            legend=False,           # Meaning is conveyed by the colorbar instead
            hue_norm=norm,          # Keep hue mapping consistent with the colorbar
            palette=cmap
        )

        # English title; fall back to the original name if no translation exists
        translated_category = category_translation.get(category, category)
        axes[i].set_title(f'Category: {translated_category}', fontsize=14)

        # Set axis labels
        axes[i].set_xlabel('Seller Longitude')
        axes[i].set_ylabel('Seller Latitude')

        # Remove top, left, and right spines for clarity
        # (matches the styling of the sibling per-category panel figures)
        for spine in ('top', 'left', 'right'):
            axes[i].spines[spine].set_visible(False)

        # Per-panel colorbar (each panel has its own normalization)
        cbar = plt.colorbar(
            plt.cm.ScalarMappable(norm=norm, cmap=cmap),
            ax=axes[i],
            orientation='vertical'
        )
        cbar.set_label('Distance Efficiency', fontsize=12)
        cbar.ax.tick_params(labelsize=10)

        # 5 evenly spaced float ticks. (Truncating to int here can collapse
        # narrow value ranges into duplicate tick labels.)
        tick_positions = np.linspace(vmin, vmax, num=5)
        cbar.set_ticks(tick_positions)
        cbar.ax.set_yticklabels([f'{tick:.2f}' for tick in tick_positions])

    else:
        # No sellers for this category: hide the panel entirely
        axes[i].set_visible(False)

# Drop the unused axes (12-panel grid, only 10 categories)
for j in range(len(largest_categories), len(axes)):
    fig.delaxes(axes[j])

# Adjust layout, leaving headroom for the suptitle
plt.tight_layout(rect=[0, 0, 1, 0.95])

# Set main title for the figure
plt.suptitle('Distance Efficiency by Product Category (Seller Locations, Dot Size by Orders Filled)', fontsize=16)

# Show the plot
plt.show()
No description has been provided for this image

Freight Share¶

In [ ]:
# @title Freight Share by Product Category (Seller Locations, Dot Size by Orders Filled)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.colors import Normalize
from matplotlib import colormaps

# Mapping of product categories from Portuguese to English
category_translation = {
    'cama_mesa_banho': 'Bedding, Bath & Table',
    'beleza_saude': 'Beauty & Health',
    'esporte_lazer': 'Sports & Leisure',
    'utilidades_domesticas': 'Household Items',
    'moveis_decoracao': 'Furniture & Decoration',
    'informatica_acessorios': 'Computers & Accessories',
    'relogios_presentes': 'Watches & Gifts',
    'brinquedos': 'Toys',
    'telefonia': 'Telephony',
    'automotivo': 'Automotive'
}

# Get the 10 largest product categories by order count
largest_categories = orders_cleaned['product_category_name'].value_counts().nlargest(10).index.tolist()

# 4x3 grid: 10 populated panels plus 2 axes that are removed below
fig, axes = plt.subplots(4, 3, figsize=(20, 16))
axes = axes.flatten()  # 1-D indexing is simpler than (row, col) pairs

# Choose colormap; invert it when lower values represent better performance
# (a lower freight share is better, so darker shades mark better sellers).
# Use the colormaps registry: plt.cm.get_cmap is deprecated (removed in
# matplotlib 3.9), and the registry matches the sibling cells' style.
colormap = 'viridis'
invert_cmap = True  # Set based on metric logic
cmap = colormaps[colormap + ("_r" if invert_cmap else "")]

# One panel per category
for i, category in enumerate(largest_categories):
    # Rows belonging to the current product category
    category_data = orders_cleaned[orders_cleaned['product_category_name'] == category].copy()

    # One row per seller: mean location, mean freight share, order count
    seller_data = category_data.groupby('seller_id').agg({
        's_geolocation_lng': 'mean',   # Mean longitude of seller
        's_geolocation_lat': 'mean',   # Mean latitude of seller
        'freight_share': 'mean',       # Mean freight share
        'order_id': 'count'            # Count of orders fulfilled by the seller
    }).reset_index()

    # Only plot categories that actually have seller data
    if not seller_data.empty:
        # Metric to color by: mean freight share per seller
        metric = seller_data['freight_share']

        # Freight share is a proportion, so fix the color scale to [0, 1]
        # (makes panels directly comparable, unlike per-panel ranges)
        vmin = 0
        vmax = 1
        norm = Normalize(vmin=vmin, vmax=vmax)

        # Scatter sellers by location; color = freight share,
        # dot size = number of orders the seller fulfilled
        sns.scatterplot(
            x=seller_data['s_geolocation_lng'],
            y=seller_data['s_geolocation_lat'],
            hue=metric,
            size=seller_data['order_id'],
            sizes=(50, 400),        # Min/max dot size mapped from order count
            ax=axes[i],             # Corresponding subplot
            legend=False,           # Meaning is conveyed by the colorbar instead
            hue_norm=norm,          # Keep hue mapping consistent with the colorbar
            palette=cmap
        )

        # English title; fall back to the original name if no translation exists
        translated_category = category_translation.get(category, category)
        axes[i].set_title(f'Category: {translated_category}', fontsize=14)

        # Set labels for the axes
        axes[i].set_xlabel('Seller Longitude')
        axes[i].set_ylabel('Seller Latitude')

        # Per-panel colorbar sharing the fixed [0, 1] normalization
        cbar = plt.colorbar(
            plt.cm.ScalarMappable(norm=norm, cmap=cmap),
            ax=axes[i],
            orientation='vertical'
        )
        cbar.set_label('Freight Share', fontsize=12)
        cbar.ax.tick_params(labelsize=10)  # Change the tick label size

        # 6 evenly spaced ticks from 0 to 1, labeled to 2 decimal places
        tick_positions = np.linspace(vmin, vmax, 6)
        cbar.set_ticks(tick_positions)
        cbar.ax.set_yticklabels([f'{tick:.2f}' for tick in tick_positions])

    else:
        # No sellers for this category: hide the panel entirely
        axes[i].set_visible(False)

# Drop the unused axes (12-panel grid, only 10 categories)
for j in range(len(largest_categories), len(axes)):
    fig.delaxes(axes[j])

# Adjust layout, leaving headroom for the suptitle
plt.tight_layout(rect=[0, 0, 1, 0.95])

# Set main title for the entire figure
plt.suptitle('Freight Share by Product Category (Seller Locations, Dot Size by Orders Filled)', fontsize=16)

# Show the plot
plt.show()
No description has been provided for this image